ProxMenux/AppImage/scripts/health_persistence.py

"""
Health Monitor Persistence Module
Manages persistent error tracking across AppImage updates using SQLite.
Stores errors in /usr/local/share/proxmenux/health_monitor.db
(same directory as monitor.db for temperature history)

Features:
- Persistent error storage (survives AppImage updates)
- Smart error resolution (auto-clear when VM starts, or after 48h)
- Event system for future Telegram notifications
- Manual acknowledgment support

Author: MacRimi
Version: 1.1
"""

import sqlite3
import json
import os
import re
import subprocess
import threading
from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from pathlib import Path

# `re` and `subprocess` are required by the SMART AUTO-RESOLVE block of
# `_cleanup_old_errors_impl` (qm/pct status calls + error_key parsing).
# Before they were imported here, the entire auto-resolve loop raised
# NameError every 5 minutes, and the surrounding `except Exception: pass`
# silently swallowed it. Audit Tier 5 (Health stack — missing imports).

import re as _re_disk_base


def disk_base_name(name):
    """Return the parent block device for a device or partition name.

    Accepts either a bare name or a ``/dev/``-prefixed path; the prefix is
    dropped from the result. Device families whose base name itself ends in
    digits are handled before the generic "trailing digits" rule:

    - sda1         → sda
    - nvme0n1      → nvme0n1   (``n1`` is the namespace, not a partition)
    - nvme0n1p1    → nvme0n1   (only the ``pN`` suffix is removed)
    - mmcblk0p1    → mmcblk0
    - loop0p1      → loop0

    Non-string or empty input is returned unchanged; a name matching no
    rule is returned as-is (minus any ``/dev/`` prefix).
    Audit Tier 7 — NVMe partitions regex.
    """
    if not (isinstance(name, str) and name):
        return name

    device = name
    if device.startswith('/dev/'):
        device = device[len('/dev/'):]

    # Order matters: the namespace-aware families must be tried first,
    # otherwise the generic rule would never see them as already-base names.
    for pattern in (
        r'^(nvme\d+n\d+|mmcblk\d+|loop\d+)(?:p\d+)?$',
        r'^([a-z]+)\d+$',
    ):
        hit = _re_disk_base.match(pattern, device)
        if hit is not None:
            return hit.group(1)

    return device


class HealthPersistence:
    """Manages persistent health error tracking.

    State is kept in a SQLite database (see ``__init__`` for the location)
    so that recorded errors, dismissals and settings outlive the process.
    """

    # Suppression window, in hours, applied when no user setting exists for
    # a category. Users override it per category via the Suppression
    # Duration settings UI.
    DEFAULT_SUPPRESSION_HOURS = 24

    # Maps an error's `category` to the user-settings key that stores its
    # suppression duration. Several categories may share one key.
    # `cpu` (cpu_usage in health_monitor.py:879/892) and `disk` (disk_space in
    # health_monitor.py:1240) must be listed here: a category without an
    # entry silently falls back to the 24h default, ignoring the duration
    # configured in the UI.
    CATEGORY_SETTING_MAP = {
        'temperature': 'suppress_cpu',
        'cpu': 'suppress_cpu',
        'memory': 'suppress_memory',
        'storage': 'suppress_storage',
        'disk': 'suppress_storage',
        'disks': 'suppress_disks',
        'network': 'suppress_network',
        'vms': 'suppress_vms',
        'pve_services': 'suppress_pve_services',
        'logs': 'suppress_logs',
        'updates': 'suppress_updates',
        'security': 'suppress_security',
    }

    def __init__(self):
        """Set up the shared data directory, the DB path/lock, and the schema.

        The database lives in the shared ProxMenux data directory, which is
        created on demand. Schema creation/migration runs immediately.
        """
        root = Path('/usr/local/share/proxmenux')
        self.data_dir = root
        root.mkdir(parents=True, exist_ok=True)

        self.db_path = root / 'health_monitor.db'
        # Serializes read-modify-write sequences issued by the public methods.
        self._db_lock = threading.Lock()
        self._init_database()

    def _get_conn(self) -> sqlite3.Connection:
        """Get a SQLite connection with timeout and WAL mode for safe concurrency.

        IMPORTANT: Always close the connection when done, preferably using
        the _db_connection() context manager. If not closed explicitly,
        Python's GC will close it, but this is unreliable under load.
        """
        conn = sqlite3.connect(str(self.db_path), timeout=30)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=10000')
        return conn

    @contextmanager
    def _db_connection(self, row_factory: bool = False):
        """Context manager for safe database connections (B4 fix).

        Ensures connections are always closed, even if exceptions occur.
        Usage:
            with self._db_connection() as conn:
                cursor = conn.cursor()
                ...
        """
        conn = self._get_conn()
        if row_factory:
            conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    def _init_database(self):
        """Initialize SQLite database with required tables"""
        try:
            conn = self._get_conn()
            cursor = conn.cursor()
        except Exception as e:
            print(f"[HealthPersistence] CRITICAL: Failed to connect to database: {e}")
            return

        print(f"[HealthPersistence] Initializing database at {self.db_path}")

        # Errors table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS errors (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                error_key TEXT UNIQUE NOT NULL,
                category TEXT NOT NULL,
                severity TEXT NOT NULL,
                reason TEXT NOT NULL,
                details TEXT,
                first_seen TEXT NOT NULL,
                last_seen TEXT NOT NULL,
                resolved_at TEXT,
                resolution_type TEXT,
                resolution_reason TEXT,
                acknowledged INTEGER DEFAULT 0,
                acknowledged_at TEXT,
                notification_sent INTEGER DEFAULT 0,
                occurrence_count INTEGER DEFAULT 1,
                suppression_hours INTEGER DEFAULT 24
            )
        ''')

        # Events table (for future Telegram notifications)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS events (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                event_type TEXT NOT NULL,
                error_key TEXT NOT NULL,
                timestamp TEXT NOT NULL,
                data TEXT
            )
        ''')

        # System capabilities table (detected once, cached forever)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS system_capabilities (
                cap_key TEXT PRIMARY KEY,
                cap_value TEXT NOT NULL,
                detected_at TEXT NOT NULL
            )
        ''')

        # User settings table (per-category suppression durations, etc.)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS user_settings (
                setting_key TEXT PRIMARY KEY,
                setting_value TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        ''')

        # Notification history table (records all sent notifications)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS notification_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                event_type TEXT NOT NULL,
                channel TEXT NOT NULL,
                title TEXT,
                message TEXT,
                severity TEXT,
                sent_at TEXT NOT NULL,
                success INTEGER DEFAULT 1,
                error_message TEXT,
                source TEXT DEFAULT 'server'
            )
        ''')

        # Notification cooldown persistence (survives restarts)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS notification_last_sent (
                fingerprint TEXT PRIMARY KEY,
                last_sent_ts INTEGER NOT NULL,
                count INTEGER DEFAULT 1
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS digest_pending (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                channel TEXT NOT NULL,
                event_type TEXT NOT NULL,
                event_group TEXT NOT NULL,
                severity TEXT NOT NULL,
                ts INTEGER NOT NULL,
                title TEXT NOT NULL,
                body TEXT NOT NULL
            )
        ''')
        cursor.execute(
            'CREATE INDEX IF NOT EXISTS idx_digest_pending_channel '
            'ON digest_pending(channel, ts)'
        )

        # Migration: add missing columns to errors table for existing DBs
        cursor.execute("PRAGMA table_info(errors)")
        columns = [col[1] for col in cursor.fetchall()]

        if 'suppression_hours' not in columns:
            cursor.execute('ALTER TABLE errors ADD COLUMN suppression_hours INTEGER DEFAULT 24')

        if 'acknowledged_at' not in columns:
            cursor.execute('ALTER TABLE errors ADD COLUMN acknowledged_at TEXT')

        if 'occurrence_count' not in columns:
            cursor.execute('ALTER TABLE errors ADD COLUMN occurrence_count INTEGER DEFAULT 1')

        if 'resolution_type' not in columns:
            cursor.execute('ALTER TABLE errors ADD COLUMN resolution_type TEXT')

        if 'resolution_reason' not in columns:
            cursor.execute('ALTER TABLE errors ADD COLUMN resolution_reason TEXT')

        # Indexes for performance
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_error_key ON errors(error_key)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_sent_at ON notification_history(sent_at)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)')

        # ── Disk Observations System ──
        # Registry of all physical disks seen by the system
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS disk_registry (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                device_name TEXT NOT NULL,
                serial TEXT,
                model TEXT,
                size_bytes INTEGER,
                first_seen TEXT NOT NULL,
                last_seen TEXT NOT NULL,
                removed INTEGER DEFAULT 0,
                worst_health TEXT DEFAULT 'healthy',
                worst_health_date TEXT,
                admin_cleared TEXT,
                UNIQUE(device_name, serial)
            )
        ''')

        # Migration: add worst_health columns if they don't exist (for existing DBs)
        try:
            cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT "healthy"')
        except Exception:
            pass
        try:
            cursor.execute('ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT')
        except Exception:
            pass
        try:
            cursor.execute('ALTER TABLE disk_registry ADD COLUMN admin_cleared TEXT')
        except Exception:
            pass

        # Observation log: deduplicated error events per disk
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS disk_observations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                disk_registry_id INTEGER NOT NULL,
                error_type TEXT NOT NULL,
                error_signature TEXT NOT NULL,
                first_occurrence TEXT NOT NULL,
                last_occurrence TEXT NOT NULL,
                occurrence_count INTEGER DEFAULT 1,
                raw_message TEXT,
                severity TEXT DEFAULT 'warning',
                dismissed INTEGER DEFAULT 0,
                FOREIGN KEY(disk_registry_id) REFERENCES disk_registry(id),
                UNIQUE(disk_registry_id, error_type, error_signature)
            )
        ''')

        cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_serial ON disk_registry(serial)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_device ON disk_registry(device_name)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')

        # Migration: ensure disk_observations has all required columns
        # Some older DBs may have different column names or missing columns
        cursor.execute('PRAGMA table_info(disk_observations)')
        obs_columns = [col[1] for col in cursor.fetchall()]

        # Add missing columns if needed (SQLite doesn't support RENAME COLUMN in older versions)
        if 'error_type' not in obs_columns and 'observation_type' in obs_columns:
            # Old schema had observation_type, but we'll work with it as-is
            pass  # The code should handle both column names

        if 'first_occurrence' not in obs_columns and 'first_seen' in obs_columns:
            # Old schema had first_seen/last_seen instead of first_occurrence/last_occurrence
            pass  # The code should handle both column names

        if 'occurrence_count' not in obs_columns:
            try:
                cursor.execute('ALTER TABLE disk_observations ADD COLUMN occurrence_count INTEGER DEFAULT 1')
            except Exception:
                pass

        if 'raw_message' not in obs_columns:
            try:
                cursor.execute('ALTER TABLE disk_observations ADD COLUMN raw_message TEXT')
            except Exception:
                pass

        if 'severity' not in obs_columns:
            try:
                cursor.execute('ALTER TABLE disk_observations ADD COLUMN severity TEXT DEFAULT "warning"')
            except Exception:
                pass

        if 'dismissed' not in obs_columns:
            try:
                cursor.execute('ALTER TABLE disk_observations ADD COLUMN dismissed INTEGER DEFAULT 0')
            except Exception:
                pass

        # ── Remote Storage Exclusions System ──
        # Allows users to permanently exclude remote storages (PBS, NFS, CIFS, etc.)
        # from health monitoring and notifications
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS excluded_storages (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                storage_name TEXT UNIQUE NOT NULL,
                storage_type TEXT NOT NULL,
                excluded_at TEXT NOT NULL,
                exclude_health INTEGER DEFAULT 1,
                exclude_notifications INTEGER DEFAULT 1,
                reason TEXT
            )
        ''')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_excluded_storage ON excluded_storages(storage_name)')

        # Table for excluded network interfaces - allows users to exclude interfaces
        # (like intentionally disabled bridges) from health monitoring and notifications
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS excluded_interfaces (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                interface_name TEXT UNIQUE NOT NULL,
                interface_type TEXT NOT NULL,
                excluded_at TEXT NOT NULL,
                exclude_health INTEGER DEFAULT 1,
                exclude_notifications INTEGER DEFAULT 1,
                reason TEXT
            )
        ''')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_excluded_interface ON excluded_interfaces(interface_name)')

        conn.commit()

        # Verify all required tables exist
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = {row[0] for row in cursor.fetchall()}
        required_tables = {'errors', 'events', 'system_capabilities', 'user_settings',
                          'notification_history', 'notification_last_sent',
                          'disk_registry', 'disk_observations',
                          'excluded_storages', 'excluded_interfaces'}
        missing = required_tables - tables
        if missing:
            print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
        else:
            print(f"[HealthPersistence] Database initialized with {len(tables)} tables")

        # ─── Startup migration: clean stale errors from previous bug ───
        # Previous versions had a bug where journal-based errors were
        # re-processed every cycle, causing infinite notification loops.
        # The cleanup wipes any stale entries left over from that buggy
        # behaviour, but it must run **only once per upgrade**, not on every
        # restart. Otherwise a real, ongoing failure (a disk dying for two+
        # hours while the host is rebooted) loses its `first_seen` history
        # and looks "new" again on the next boot. Audit Tier 5 — Health stack.
        #
        # IMPORTANT: Only cleans the `errors` table (health monitor state).
        # The `disk_observations` table is a PERMANENT historical record
        # and must NEVER be auto-modified on startup. Users dismiss
        # observations manually from the disk detail UI.
        #
        # Covers: disk I/O (smart_*, disk_*), VM/CT (vm_*, ct_*, vmct_*),
        # and log errors (log_*) — all journal-sourced categories.
        _STARTUP_CLEANUP_VERSION = '1'
        try:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT setting_value FROM user_settings WHERE setting_key = ?',
                ('startup_cleanup_version',)
            )
            row = cursor.fetchone()
            already_run = row and row[0] == _STARTUP_CLEANUP_VERSION

            if not already_run:
                cutoff = (datetime.now() - timedelta(hours=2)).isoformat()
                cursor.execute('''
                    DELETE FROM errors
                    WHERE (   error_key LIKE 'smart_%'
                           OR error_key LIKE 'disk_%'
                           OR error_key LIKE 'vm_%'
                           OR error_key LIKE 'ct_%'
                           OR error_key LIKE 'vmct_%'
                           OR error_key LIKE 'log_%'
                          )
                      AND resolved_at IS NULL
                      AND acknowledged = 0
                      AND last_seen < ?
                ''', (cutoff,))
                cleaned_errors = cursor.rowcount

                cursor.execute('''
                    INSERT OR REPLACE INTO user_settings
                        (setting_key, setting_value, updated_at)
                    VALUES (?, ?, ?)
                ''', ('startup_cleanup_version', _STARTUP_CLEANUP_VERSION,
                      datetime.now().isoformat()))

                conn.commit()
                if cleaned_errors > 0:
                    print(f"[HealthPersistence] One-time startup cleanup (v{_STARTUP_CLEANUP_VERSION}): "
                          f"removed {cleaned_errors} stale error(s) from health monitor")
        except Exception as e:
            print(f"[HealthPersistence] Startup cleanup warning: {e}")

        conn.close()

    def record_error(self, error_key: str, category: str, severity: str,
                    reason: str, details: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Record or update an error.
        Returns event info (new_error, updated, etc.)
        """
        with self._db_lock:
            return self._record_error_impl(error_key, category, severity, reason, details)

    def _record_error_impl(self, error_key, category, severity, reason, details):
        """Insert, update, re-trigger or skip an error row and log the event.

        `record_error` calls this while holding `_db_lock`.

        Decision order:
          1. VM/CT keys (`vm_`, `ct_`, `vmct_`) whose id fails
             `_check_vm_ct_exists`, and disk keys (`smart_`, `disk_`,
             `io_error_`) whose device node is missing under /dev, are
             skipped without touching the database.
          2. Existing acknowledged row: a CRITICAL `cpu_temperature` always
             re-triggers; a permanent dismiss (suppression_hours == -1) or a
             still-running suppression window skips; an expired window
             deletes the row and re-inserts it as new.
          3. Existing unacknowledged row: refreshed in place; a
             WARNING -> CRITICAL change is reported as 'escalated'.
          4. No row: inserted as new.
          5. Unless this is a CRITICAL `cpu_temperature`, a stored
             per-category suppression setting that differs from the default
             immediately acknowledges the (still unacknowledged) row.

        Returns:
            dict with at least 'type' and 'needs_notification'; skipped
            resources also carry a 'reason'.
        """
        # === RESOURCE EXISTENCE CHECK (before DB access) ===
        # Skip recording errors for resources that no longer exist
        if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
            import re
            vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
            if vmid_match:
                vmid = vmid_match.group(1)
                if not self._check_vm_ct_exists(vmid):
                    return {'type': 'skipped', 'needs_notification': False,
                            'reason': f'VM/CT {vmid} no longer exists'}

        if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
            import re
            import os
            disk_match = re.search(r'(?:smart_|disk_fs_|disk_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
            if disk_match:
                disk_name = disk_match.group(1)
                base_disk = disk_base_name(disk_name)
                # Skip only when neither the named node nor its parent disk exists.
                if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
                    return {'type': 'skipped', 'needs_notification': False,
                            'reason': f'Disk /dev/{disk_name} no longer exists'}

        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            now = datetime.now().isoformat()
            details_json = json.dumps(details) if details else None

            cursor.execute('''
                SELECT id, acknowledged, resolved_at, category, severity, first_seen,
                       notification_sent, suppression_hours, acknowledged_at
                FROM errors WHERE error_key = ?
            ''', (error_key,))
            existing = cursor.fetchone()

            # Default outcome; overwritten by the branches below.
            event_info = {'type': 'updated', 'needs_notification': False}

            if existing:
                (err_id, ack, resolved_at, old_cat, old_severity, first_seen,
                 notif_sent, stored_suppression, acknowledged_at) = existing

                if ack == 1:
                    # SAFETY OVERRIDE: Critical CPU temperature ALWAYS re-triggers
                    if error_key == 'cpu_temperature' and severity == 'CRITICAL':
                        cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                        cursor.execute('''
                            INSERT INTO errors
                            (error_key, category, severity, reason, details, first_seen, last_seen)
                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        ''', (error_key, category, severity, reason, details_json, now, now))
                        event_info = {'type': 'new', 'needs_notification': True}
                        self._record_event(cursor, 'new', error_key,
                                          {'severity': severity, 'reason': reason,
                                           'note': 'CRITICAL temperature override - safety alert'})
                        conn.commit()
                        return event_info

                    # Check suppression: use per-record stored hours (set at dismiss time)
                    sup_hours = stored_suppression if stored_suppression is not None else self.DEFAULT_SUPPRESSION_HOURS

                    # Permanent dismiss (sup_hours == -1): always suppress
                    if sup_hours == -1:
                        return {'type': 'skipped_acknowledged', 'needs_notification': False}

                    # Time-limited suppression. Prefer `acknowledged_at` as the
                    # reference time — that's what the user-dismiss path writes.
                    # `_acknowledge_error_impl` does NOT touch `resolved_at`, so
                    # falling through to the resolved_at-only check broke the
                    # dismiss for ALL non-journal categories (vms, services,
                    # cpu/memory, network, storage, security, updates): the
                    # detector re-fires every 5 min and the suppression window
                    # never starts. Audit Tier 5 (Health stack — `_record_error_impl`).
                    ref_time_str = acknowledged_at or resolved_at
                    still_suppressed = False
                    if ref_time_str:
                        try:
                            ref_dt = datetime.fromisoformat(ref_time_str)
                            elapsed_hours = (datetime.now() - ref_dt).total_seconds() / 3600
                            still_suppressed = elapsed_hours < sup_hours
                        except Exception:
                            # Unparseable timestamp: treated as "not suppressed".
                            pass

                    if still_suppressed:
                        return {'type': 'skipped_acknowledged', 'needs_notification': False}

                    # Suppression expired — re-trigger uniformly across categories.
                    # Previous code special-cased journal-sourced errors (logs/smart/
                    # disk/io_error) with a DELETE-without-INSERT workaround to dodge
                    # an infinite-notification loop. That loop was a symptom of the
                    # `acknowledged_at` bug fixed in Sprint 7.7 — without it,
                    # suppression never actually started and every cycle re-triggered.
                    # With suppression honoring acknowledged_at, the legitimate
                    # behavior is: when the window expires AND the underlying
                    # condition is still present in the journal, raise it once and
                    # let the user re-dismiss if they want.
                    cursor.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                    cursor.execute('''
                        INSERT INTO errors
                        (error_key, category, severity, reason, details, first_seen, last_seen)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''', (error_key, category, severity, reason, details_json, now, now))
                    event_info = {'type': 'new', 'needs_notification': True}
                    self._record_event(cursor, 'new', error_key,
                                      {'severity': severity, 'reason': reason,
                                       'note': 'Re-triggered after suppression expired'})
                    conn.commit()
                    return event_info

                # Not acknowledged - update existing active error
                # NOTE(review): this UPDATE leaves `resolved_at` untouched, so a
                # row that was resolved earlier stays marked resolved when the
                # condition recurs — confirm that is intended.
                cursor.execute('''
                    UPDATE errors
                    SET last_seen = ?, severity = ?, reason = ?, details = ?
                    WHERE error_key = ? AND acknowledged = 0
                ''', (now, severity, reason, details_json, error_key))

                # Check if severity escalated
                if old_severity == 'WARNING' and severity == 'CRITICAL':
                    event_info['type'] = 'escalated'
                    event_info['needs_notification'] = True
            else:
                # Insert new error
                cursor.execute('''
                    INSERT INTO errors
                    (error_key, category, severity, reason, details, first_seen, last_seen)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (error_key, category, severity, reason, details_json, now, now))

                event_info['type'] = 'new'
                event_info['needs_notification'] = True

            # ─── Auto-suppress: if the category has a stored, non-default
            # suppression setting, acknowledge the row right away with that
            # duration. CRITICAL cpu_temperature is exempt. ───
            if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
                setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
                if setting_key:
                    stored = self._get_setting_impl(conn, setting_key)
                    if stored is not None:
                        # NOTE(review): int() raises ValueError on a
                        # non-numeric stored value, which would abort this
                        # call before commit — confirm settings are validated
                        # when written.
                        configured_hours = int(stored)
                        if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
                            cursor.execute('''
                                UPDATE errors
                                SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
                                WHERE error_key = ? AND acknowledged = 0
                            ''', (now, configured_hours, error_key))

                            if cursor.rowcount > 0:
                                self._record_event(cursor, 'auto_suppressed', error_key, {
                                    'severity': severity,
                                    'reason': reason,
                                    'suppression_hours': configured_hours,
                                    'note': 'Auto-suppressed by user settings'
                                })
                                event_info['type'] = 'auto_suppressed'
                                event_info['needs_notification'] = False
                                conn.commit()
                                return event_info

            # Record event
            self._record_event(cursor, event_info['type'], error_key,
                              {'severity': severity, 'reason': reason})

            conn.commit()
        finally:
            # Runs on every exit path above, including the early returns.
            conn.close()

        return event_info

    def resolve_error(self, error_key: str, reason: str = 'auto-resolved'):
        """Mark an error as resolved"""
        with self._db_lock:
            return self._resolve_error_impl(error_key, reason)

    def _resolve_error_impl(self, error_key, reason):
        with self._db_connection() as conn:
            cursor = conn.cursor()
            now = datetime.now().isoformat()

            cursor.execute('''
                UPDATE errors
                SET resolved_at = ?
                WHERE error_key = ? AND resolved_at IS NULL
            ''', (now, error_key))

            if cursor.rowcount > 0:
                self._record_event(cursor, 'resolved', error_key, {'reason': reason})

            conn.commit()

    def is_error_active(self, error_key: str, category: Optional[str] = None) -> bool:
        """
        Check if an error is currently active OR suppressed (dismissed but within suppression period).
        Used by checks to avoid re-recording errors that are already tracked or dismissed.

        Returns True if:
        - Error is active (unresolved and not acknowledged), OR
        - Error is dismissed but still within its suppression period
        """
        with self._db_connection() as conn:
            cursor = conn.cursor()

            # First check: is the error active (unresolved and not acknowledged)?
            if category:
                cursor.execute('''
                    SELECT COUNT(*) FROM errors
                    WHERE error_key = ? AND category = ?
                      AND resolved_at IS NULL AND acknowledged = 0
                ''', (error_key, category))
            else:
                cursor.execute('''
                    SELECT COUNT(*) FROM errors
                    WHERE error_key = ?
                      AND resolved_at IS NULL AND acknowledged = 0
                ''', (error_key,))

            active_count = cursor.fetchone()[0]
            if active_count > 0:
                return True

            # Second check: is the error dismissed but still within suppression period?
            # This prevents re-recording dismissed errors before their suppression expires
            # Note: acknowledged errors may have resolved_at NULL (dismissed but error still exists)
            # or resolved_at set (error was dismissed AND condition resolved)
            if category:
                cursor.execute('''
                    SELECT acknowledged_at, suppression_hours FROM errors
                    WHERE error_key = ? AND category = ?
                      AND acknowledged = 1
                    ORDER BY acknowledged_at DESC LIMIT 1
                ''', (error_key, category))
            else:
                cursor.execute('''
                    SELECT acknowledged_at, suppression_hours FROM errors
                    WHERE error_key = ?
                      AND acknowledged = 1
                    ORDER BY acknowledged_at DESC LIMIT 1
                ''', (error_key,))

            row = cursor.fetchone()

        if row:
            acknowledged_at_str, suppression_hours = row
            if acknowledged_at_str and suppression_hours:
                try:
                    acknowledged_at = datetime.fromisoformat(acknowledged_at_str)
                    suppression_end = acknowledged_at + timedelta(hours=suppression_hours)
                    if datetime.now() < suppression_end:
                        # Still within suppression period - treat as "active" to prevent re-recording
                        return True
                except (ValueError, TypeError):
                    pass

        return False

    def clear_error(self, error_key: str):
        """
        Resolve or remove the record for ``error_key`` right away.

        Intended for the moment the triggering condition disappears (for
        example a storage that is reachable again, or a CPU temperature that
        recovered).

        - If the looked-up row is flagged ``acknowledged = 1`` (dismissed by
          the user), every row with that key is deleted so that a later
          recurrence is recorded as a brand-new error.
        - Otherwise, rows with that key that are still unresolved get
          ``resolved_at`` set to the current time.

        A ``cleared`` event is recorded only when a row was actually changed.

        The SELECT and the following DELETE/UPDATE run while holding
        ``_db_lock`` so that other writers taking the same lock cannot touch
        the row between the read and the write.
        """
        with self._db_lock, self._db_connection() as conn:
            cur = conn.cursor()
            resolved_stamp = datetime.now().isoformat()

            # Find out whether the user had dismissed this error.
            cur.execute('''
                SELECT acknowledged FROM errors WHERE error_key = ?
            ''', (error_key,))
            ack_row = cur.fetchone()
            dismissed = bool(ack_row) and ack_row[0] == 1

            if dismissed:
                # Dismissed and now gone on its own: drop the record entirely.
                cur.execute('DELETE FROM errors WHERE error_key = ?', (error_key,))
                event_payload = {'reason': 'condition_resolved_after_dismiss'}
            else:
                # Regular active error: keep the row but stamp it as resolved.
                cur.execute('''
                    UPDATE errors
                    SET resolved_at = ?
                    WHERE error_key = ? AND resolved_at IS NULL
                ''', (resolved_stamp, error_key))
                event_payload = {'reason': 'condition_resolved'}

            # Only emit an event when the statement above affected a row.
            if cur.rowcount > 0:
                self._record_event(cur, 'cleared', error_key, event_payload)

            conn.commit()

    def acknowledge_error(self, error_key: str) -> Dict[str, Any]:
        """
        Dismiss (manually acknowledge) the error identified by ``error_key``.

        Thin, lock-holding entry point: the actual work is done by
        ``_acknowledge_error_impl`` while ``_db_lock`` is held, and its result
        dict is returned unchanged.
        """
        with self._db_lock:
            outcome = self._acknowledge_error_impl(error_key)
        return outcome

    def _acknowledge_error_impl(self, error_key):
        """Persist a user dismissal for ``error_key`` and return a result dict.

        Two cases:

        * The key already has a row in ``errors``: the row is flagged
          ``acknowledged = 1`` (``resolved_at`` is left alone) with the
          suppression hours configured for its category. For the aggregate
          log checks listed in the cascade table, the still-active,
          unacknowledged child rows are flagged too. Afterwards, unless the
          suppression is ``-1``, the matching cooldown is cleared
          (disk I/O cooldown for ``disks``, notification cooldown otherwise).
        * The key has no row yet: a minimal acknowledged ``WARNING`` row is
          inserted so the dismissal persists. Its category is inferred from
          the key prefix (most specific prefix first), then from a few
          disk-related substrings, and finally defaults to ``general``.
          This path returns right after the commit, before the cooldown step.

        In both cases an ``acknowledged`` event is recorded and the
        connection is always closed.
        """
        conn = self._get_conn()
        conn.row_factory = sqlite3.Row
        category = ''
        sup_hours = self.DEFAULT_SUPPRESSION_HOURS

        def configured_suppression(cat_name):
            # Suppression hours stored for the category's setting, or the
            # default when there is no setting / no value / a non-integer value.
            fallback = self.DEFAULT_SUPPRESSION_HOURS
            setting_key = self.CATEGORY_SETTING_MAP.get(cat_name, '')
            if not setting_key:
                return fallback
            stored = self._get_setting_impl(conn, setting_key)
            if stored is None:
                return fallback
            try:
                return int(stored)
            except (ValueError, TypeError):
                return fallback

        try:
            cursor = conn.cursor()
            now = datetime.now().isoformat()

            # Load the current record (if any) before changing it.
            cursor.execute('SELECT * FROM errors WHERE error_key = ?', (error_key,))
            existing = cursor.fetchone()

            if existing:
                record = dict(existing)
                original_severity = record.get('severity', 'WARNING')
                category = record.get('category', '')
                sup_hours = configured_suppression(category)

                # Flag as acknowledged; resolved_at is deliberately untouched.
                cursor.execute('''
                    UPDATE errors
                    SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
                    WHERE error_key = ?
                ''', (now, sup_hours, error_key))

                self._record_event(cursor, 'acknowledged', error_key, {
                    'original_severity': original_severity,
                    'category': category,
                    'suppression_hours': sup_hours
                })

                # Dismissing an aggregate log check also silences the
                # individual child errors it groups, so they stop showing up
                # (and notifying) on their own.
                cascade_children = {
                    'log_persistent_errors': 'log_persistent_',
                    'log_error_cascade': 'log_critical_',
                    'log_error_spike': 'log_critical_',
                }
                child_prefix = cascade_children.get(error_key)
                if child_prefix:
                    cursor.execute('''
                        UPDATE errors
                        SET acknowledged = 1, acknowledged_at = ?, suppression_hours = ?
                        WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
                    ''', (now, sup_hours, child_prefix + '%'))

                result = {
                    'success': True,
                    'error_key': error_key,
                    'original_severity': original_severity,
                    'category': category,
                    'acknowledged_at': now,
                    'suppression_hours': sup_hours
                }
            else:
                # No row yet: infer a category from the key. Longer, more
                # specific prefixes come first ('security_updates' must win
                # over 'security_').
                prefix_to_category = (
                    ('security_updates', 'updates'), ('system_age', 'updates'),
                    ('pending_updates', 'updates'), ('kernel_pve', 'updates'),
                    ('security_', 'security'),
                    ('pve_service_', 'pve_services'),
                    ('vmct_', 'vms'), ('vm_', 'vms'), ('ct_', 'vms'),
                    ('disk_smart_', 'disks'), ('disk_', 'disks'),
                    ('smart_', 'disks'), ('zfs_pool_', 'disks'),
                    ('log_', 'logs'), ('net_', 'network'),
                    ('temp_', 'temperature'),
                )
                category = next(
                    (cat for prefix, cat in prefix_to_category if error_key.startswith(prefix)),
                    ''
                )

                # Last resort: look for disk-ish substrings anywhere in the key.
                if not category:
                    disk_hints = ('disk', 'smart', 'sda', 'sdb', 'nvme')
                    if any(hint in error_key for hint in disk_hints):
                        category = 'disks'
                    else:
                        category = 'general'

                sup_hours = configured_suppression(category)

                # Stored as acknowledged but NOT resolved.
                cursor.execute('''
                    INSERT INTO errors (error_key, category, severity, reason, first_seen, last_seen,
                                        occurrence_count, acknowledged, acknowledged_at, suppression_hours)
                    VALUES (?, ?, 'WARNING', 'Dismissed by user', ?, ?, 1, 1, ?, ?)
                ''', (error_key, category, now, now, now, sup_hours))

                self._record_event(cursor, 'acknowledged', error_key, {
                    'original_severity': 'WARNING',
                    'category': category,
                    'suppression_hours': sup_hours
                })

                result = {
                    'success': True,
                    'error_key': error_key,
                    'original_severity': 'WARNING',
                    'category': category,
                    'suppression_hours': sup_hours,
                    'acknowledged_at': now
                }
                conn.commit()
                return result

            conn.commit()
        finally:
            conn.close()

        # ── Coordinate with notification cooldowns ──
        if sup_hours != -1:
            if category == 'disks':
                reset_cooldown = self._clear_disk_io_cooldown
            else:
                reset_cooldown = self._clear_notification_cooldown
            reset_cooldown(error_key)

        return result

    def is_error_acknowledged(self, error_key: str) -> bool:
        """Tell whether ``error_key`` is dismissed and still being suppressed.

        The suppression window is measured from ``acknowledged_at`` (not
        ``resolved_at``, which can be NULL for a dismissed error) and lasts
        ``suppression_hours``, falling back to ``DEFAULT_SUPPRESSION_HOURS``
        when that column is empty or zero. A negative value means the
        suppression never expires.

        Returns False when there is no such row, the row is not acknowledged,
        the window has elapsed, or anything goes wrong while querying. An
        acknowledged row whose timestamp is missing or unparsable counts as
        still suppressed.
        """
        try:
            with self._db_connection(row_factory=True) as conn:
                cur = conn.cursor()
                cur.execute(
                    'SELECT acknowledged, acknowledged_at, suppression_hours FROM errors WHERE error_key = ?',
                    (error_key,))
                record = cur.fetchone()

            if not record or not record['acknowledged']:
                return False

            window_hours = record['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
            if window_hours < 0:
                # -1 (any negative value) marks a permanent suppression.
                return True

            ack_stamp = record['acknowledged_at']
            if not ack_stamp:
                return True

            try:
                expires_at = datetime.fromisoformat(ack_stamp) + timedelta(hours=window_hours)
                expired = datetime.now() > expires_at
            except Exception:
                # Unparsable timestamp: keep treating the error as suppressed.
                expired = False
            return not expired
        except Exception:
            return False

    def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return every active error, optionally restricted to one category.

        "Active" means unresolved (``resolved_at IS NULL``) and not dismissed
        (``acknowledged = 0``).

        Args:
            category: When truthy, only errors of this category are returned.

        Returns:
            One dict per row, ordered by ``severity DESC, last_seen DESC``.
            A non-empty ``details`` column is decoded from JSON; if it cannot
            be decoded the stored value is returned unchanged, mirroring
            ``get_error_by_key``, so a single corrupt row cannot break the
            whole listing.
        """
        # NOTE(review): `severity` is compared by the database as stored; if
        # it holds plain labels such as 'WARNING', DESC is alphabetical rather
        # than by importance — confirm this ordering is what callers expect.
        with self._db_connection(row_factory=True) as conn:
            cursor = conn.cursor()

            if category:
                cursor.execute('''
                    SELECT * FROM errors
                    WHERE resolved_at IS NULL AND acknowledged = 0 AND category = ?
                    ORDER BY severity DESC, last_seen DESC
                ''', (category,))
            else:
                cursor.execute('''
                    SELECT * FROM errors
                    WHERE resolved_at IS NULL AND acknowledged = 0
                    ORDER BY severity DESC, last_seen DESC
                ''')

            rows = cursor.fetchall()

        errors = []
        for row in rows:
            error_dict = dict(row)
            if error_dict.get('details'):
                try:
                    error_dict['details'] = json.loads(error_dict['details'])
                except (json.JSONDecodeError, TypeError):
                    # Keep the raw value rather than failing the whole query.
                    pass
            errors.append(error_dict)

        return errors

    def get_error_by_key(self, error_key: str) -> Optional[Dict[str, Any]]:
        """Fetch the unresolved error stored under ``error_key``.

        Args:
            error_key: Key of the error to look up.

        Returns:
            The full row as a dict (all columns of ``errors``), or None when
            no row with that key has ``resolved_at IS NULL``. A non-empty
            ``details`` column is decoded from JSON; if decoding fails the
            stored value is returned unchanged.

        The connection is closed even when the query raises, so a failing
        lookup (locked database, missing table, ...) does not leak a handle.
        """
        conn = self._get_conn()
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute('''
            SELECT * FROM errors
            WHERE error_key = ? AND resolved_at IS NULL
            LIMIT 1
        ''', (error_key,))
            row = cursor.fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        error_dict = dict(row)
        if error_dict.get('details'):
            try:
                error_dict['details'] = json.loads(error_dict['details'])
            except (json.JSONDecodeError, TypeError):
                # Leave undecodable details as the raw stored value.
                pass
        return error_dict

    def cleanup_old_errors(self):
        """Run the error cleanup / auto-resolve pass while holding ``_db_lock``.

        Delegates to ``_cleanup_old_errors_impl`` and returns whatever it
        returns.
        """
        with self._db_lock:
            outcome = self._cleanup_old_errors_impl()
        return outcome

    def _cleanup_old_errors_impl(self):
        """Prune old rows and auto-resolve stale errors.

        Called by ``cleanup_old_errors`` with ``_db_lock`` held. Everything
        below runs on one connection and is committed once at the end:

        1. Delete errors whose ``resolved_at`` is older than 7 days.
        2. For each category in ``CATEGORY_SETTING_MAP``, resolve
           unacknowledged errors whose ``last_seen`` is older than that
           category's ``suppress_*`` setting (``DEFAULT_SUPPRESSION_HOURS``
           when the setting is missing or not an integer). A negative value
           skips the category.
        3. Catch-all for categories that are NOT in the map (or NULL): resolve
           unacknowledged errors older than ``DEFAULT_SUPPRESSION_HOURS``.
           Mapped categories are excluded here so that step 2's per-category
           window — including the negative "skip" value — is not overridden.
        4. Delete events older than 30 days.
        5. Best-effort "smart" auto-resolve based on live system state (only
           when the host has been up for more than 10 minutes): stale log
           errors, CPU errors while CPU is below 75%, memory/OOM errors while
           memory is below 80%, VM/CT errors whose guest is running again or
           no longer exists, and finally any non-disk/storage error not seen
           for 30 minutes while CPU < 80% and memory < 85%. Any failure in
           this step is swallowed and the step is skipped.

        After the connection is closed, ``_cleanup_stale_resources()`` runs.
        """
        conn = self._get_conn()
        try:
            cursor = conn.cursor()

            now = datetime.now()
            now_iso = now.isoformat()

            # Delete resolved errors older than 7 days
            cutoff_resolved = (now - timedelta(days=7)).isoformat()
            cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,))

            # ── Auto-resolve stale errors using Suppression Duration settings ──
            user_settings = {}
            try:
                cursor.execute(
                    'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
                    ('suppress_%',)
                )
                for row in cursor.fetchall():
                    user_settings[row[0]] = row[1]
            except Exception:
                # Settings are optional here; fall back to defaults for all categories.
                pass

            for category, setting_key in self.CATEGORY_SETTING_MAP.items():
                stored = user_settings.get(setting_key)
                try:
                    hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS
                except (ValueError, TypeError):
                    hours = self.DEFAULT_SUPPRESSION_HOURS

                # Negative means "never auto-resolve by age" for this category.
                if hours < 0:
                    continue

                cutoff = (now - timedelta(hours=hours)).isoformat()
                cursor.execute('''
                    UPDATE errors
                    SET resolved_at = ?
                    WHERE category = ?
                      AND resolved_at IS NULL
                      AND last_seen < ?
                      AND acknowledged = 0
                ''', (now_iso, category, cutoff))

            # Catch-all: auto-resolve any error from an unmapped category.
            # Mapped categories were handled by the loop above with their own
            # window; including them here would silently override settings
            # that are longer than the default or negative.
            fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat()
            mapped_categories = list(self.CATEGORY_SETTING_MAP)
            if mapped_categories:
                placeholders = ','.join('?' * len(mapped_categories))
                # `category IS NULL` is spelled out because `NULL NOT IN (...)`
                # evaluates to NULL in SQL and would skip those rows.
                cursor.execute(f'''
                    UPDATE errors
                    SET resolved_at = ?
                    WHERE resolved_at IS NULL
                      AND acknowledged = 0
                      AND last_seen < ?
                      AND (category IS NULL OR category NOT IN ({placeholders}))
                ''', (now_iso, fallback_cutoff, *mapped_categories))
            else:
                cursor.execute('''
                    UPDATE errors
                    SET resolved_at = ?
                    WHERE resolved_at IS NULL
                      AND acknowledged = 0
                      AND last_seen < ?
                ''', (now_iso, fallback_cutoff))

            # Delete old events (>30 days)
            cutoff_events = (now - timedelta(days=30)).isoformat()
            cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))

            # ── SMART AUTO-RESOLVE: Based on system state ──
            try:
                import psutil
                with open('/proc/uptime', 'r') as f:
                    uptime_seconds = float(f.read().split()[0])

                if uptime_seconds > 600:
                    current_cpu = psutil.cpu_percent(interval=0.1)
                    current_mem = psutil.virtual_memory().percent

                    # 1. LOGS: Auto-resolve if not seen in 15 minutes
                    stale_logs_cutoff = (now - timedelta(minutes=15)).isoformat()
                    cursor.execute('''
                        UPDATE errors SET resolved_at = ?
                        WHERE category = 'logs' AND resolved_at IS NULL
                          AND acknowledged = 0 AND last_seen < ?
                    ''', (now_iso, stale_logs_cutoff))

                    # 2. CPU: Auto-resolve if CPU is normal (<75%)
                    if current_cpu < 75:
                        stale_cpu_cutoff = (now - timedelta(minutes=5)).isoformat()
                        cursor.execute('''
                            UPDATE errors SET resolved_at = ?
                            WHERE (category = 'cpu' OR category = 'temperature')
                              AND resolved_at IS NULL AND acknowledged = 0
                              AND last_seen < ?
                              AND (error_key LIKE 'cpu_%' OR reason LIKE '%CPU%')
                        ''', (now_iso, stale_cpu_cutoff))

                    # 3. MEMORY: Auto-resolve if memory is normal (<80%)
                    if current_mem < 80:
                        stale_mem_cutoff = (now - timedelta(minutes=5)).isoformat()
                        cursor.execute('''
                            UPDATE errors SET resolved_at = ?
                            WHERE (category = 'memory' OR category = 'logs')
                              AND resolved_at IS NULL AND acknowledged = 0
                              AND last_seen < ?
                              AND (error_key LIKE '%oom%' OR error_key LIKE '%memory%'
                                   OR reason LIKE '%memory%' OR reason LIKE '%OOM%'
                                   OR reason LIKE '%killed%process%')
                        ''', (now_iso, stale_mem_cutoff))

                    # 4. VMS: Auto-resolve if VM/CT is now running or deleted
                    cursor.execute('''
                        SELECT error_key, category, reason FROM errors
                        WHERE (category IN ('vms', 'vmct') OR error_key LIKE 'vm_%'
                               OR error_key LIKE 'ct_%' OR error_key LIKE 'vmct_%')
                          AND resolved_at IS NULL AND acknowledged = 0
                    ''')
                    vm_errors = cursor.fetchall()
                    for vm_ek, _cat, vm_reason in vm_errors:
                        vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', vm_ek)
                        if not vmid_match:
                            continue
                        vmid = vmid_match.group(1)
                        try:
                            vm_exists = vm_running = False
                            ct_exists = ct_running = False

                            result_vm = subprocess.run(
                                ['qm', 'status', vmid],
                                capture_output=True, text=True, timeout=2)
                            if result_vm.returncode == 0:
                                vm_exists = True
                                vm_running = 'running' in result_vm.stdout.lower()

                            # Only ask pct when the ID is not a known VM.
                            if not vm_exists:
                                result_ct = subprocess.run(
                                    ['pct', 'status', vmid],
                                    capture_output=True, text=True, timeout=2)
                                if result_ct.returncode == 0:
                                    ct_exists = True
                                    ct_running = 'running' in result_ct.stdout.lower()

                            if not vm_exists and not ct_exists:
                                # Neither tool knows the ID: the guest is gone.
                                resolve_now = True
                            elif vm_running or ct_running:
                                # Running again — but keep errors whose reason
                                # points at a configuration problem that a
                                # running guest does not disprove.
                                reason_lower = (vm_reason or '').lower()
                                is_persistent = any(x in reason_lower for x in [
                                    'device', 'missing', 'does not exist', 'permission',
                                    'not found', 'no such', 'invalid'])
                                resolve_now = not is_persistent
                            else:
                                resolve_now = False

                            if resolve_now:
                                cursor.execute('''
                                    UPDATE errors SET resolved_at = ?
                                    WHERE error_key = ? AND resolved_at IS NULL
                                ''', (now_iso, vm_ek))
                        except Exception:
                            # qm/pct unavailable or timed out: leave this error untouched.
                            pass

                    # 5. GENERIC: Any error not seen in 30 min while system is healthy
                    if current_cpu < 80 and current_mem < 85:
                        stale_generic_cutoff = (now - timedelta(minutes=30)).isoformat()
                        cursor.execute('''
                            UPDATE errors SET resolved_at = ?
                            WHERE resolved_at IS NULL AND acknowledged = 0
                              AND last_seen < ?
                              AND category NOT IN ('disks', 'storage')
                        ''', (now_iso, stale_generic_cutoff))

            except Exception:
                pass  # If we can't read uptime, skip this cleanup

            conn.commit()
        finally:
            conn.close()

        # Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
        self._cleanup_stale_resources()

        # NOTE: cleanup_orphan_observations() is deliberately NOT invoked here.
        # Running it on the 5-minute auto-resolve cycle silently dismissed legitimate
        # observations (ZFS pool errors, ATA host events, dm-* aliases) before the user
        # could see them in the UI history, even though notifications were already sent.
        # The cleanup is still available as an explicit user action via
        # POST /api/health/cleanup-disconnected-disks (flask_health_routes.py).

    def _cleanup_stale_resources(self):
        """Resolve errors for resources that no longer exist.

        Comprehensive cleanup for ALL error categories:
        - VMs/CTs: deleted resources (not just stopped)
        - Disks: physically removed devices, ZFS pools, storage
        - Network: removed interfaces, bonds, bridges
        - Services/pve_services: services on deleted CTs, stopped services
        - Logs: persistent/spike/cascade errors older than 48h
        - Cluster: errors when node is no longer in cluster
        - Temperature: sensors that no longer exist
        - Memory/Storage: mount points that no longer exist
        - Updates/Security: acknowledged errors older than 7 days
        - General fallback: any error older than 7 days with no recent activity
        """
        import subprocess
        import re

        conn = self._get_conn()
        cursor = conn.cursor()
        now = datetime.now()
        now_iso = now.isoformat()

        # Get all active (unresolved) errors with first_seen and last_seen for age checks
        # An error is considered unresolved if resolution_type is NULL or empty
        # (resolved_at alone is not sufficient - it may be in an inconsistent state)
        cursor.execute('''
            SELECT id, error_key, category, reason, first_seen, last_seen, severity FROM errors
            WHERE resolution_type IS NULL OR resolution_type = ''
        ''')
        active_errors = cursor.fetchall()
        resolved_count = 0

        # Cache for expensive checks (avoid repeated subprocess calls)
        _vm_ct_exists_cache = {}
        _cluster_status_cache = None
        _network_interfaces_cache = None
        _zfs_pools_cache = None
        _mount_points_cache = None
        _pve_services_cache = None

        def check_vm_ct_cached(vmid):
            if vmid not in _vm_ct_exists_cache:
                _vm_ct_exists_cache[vmid] = self._check_vm_ct_exists(vmid)
            return _vm_ct_exists_cache[vmid]

        def get_cluster_status():
            nonlocal _cluster_status_cache
            if _cluster_status_cache is None:
                # Primary signal: presence of `/etc/corosync/corosync.conf`.
                # That file only exists on clustered nodes and is the same
                # check `health_monitor._check_pve_services` uses for the
                # corosync gate. Substring match on "Cluster information"
                # was fragile against locale/translations and PVE upgrades
                # renaming the header. Audit Tier 6 — `_cleanup_stale_resources::get_cluster_status`.
                is_cluster = os.path.isfile('/etc/corosync/corosync.conf')
                nodes_text = ''
                try:
                    result = subprocess.run(
                        ['pvecm', 'status'],
                        capture_output=True, text=True, timeout=5
                    )
                    if result.returncode == 0:
                        nodes_text = result.stdout
                        # Confirm via any of multiple section markers that
                        # appear on real cluster nodes, not just one.
                        if not is_cluster:
                            stdout_l = nodes_text.lower()
                            is_cluster = any(
                                marker in stdout_l
                                for marker in ('cluster information',
                                               'quorum information',
                                               'membership information')
                            )
                except Exception:
                    # On error, fall back to corosync.conf signal alone.
                    pass
                _cluster_status_cache = {
                    'is_cluster': is_cluster,
                    'nodes': nodes_text,
                }
            return _cluster_status_cache

        def get_network_interfaces():
            nonlocal _network_interfaces_cache
            if _network_interfaces_cache is None:
                try:
                    import psutil
                    _network_interfaces_cache = set(psutil.net_if_stats().keys())
                except Exception:
                    _network_interfaces_cache = set()
            return _network_interfaces_cache

        def get_zfs_pools():
            nonlocal _zfs_pools_cache
            if _zfs_pools_cache is None:
                try:
                    result = subprocess.run(
                        ['zpool', 'list', '-H', '-o', 'name'],
                        capture_output=True, text=True, timeout=5
                    )
                    if result.returncode == 0:
                        _zfs_pools_cache = set(result.stdout.strip().split('\n'))
                    else:
                        _zfs_pools_cache = set()
                except Exception:
                    _zfs_pools_cache = set()
            return _zfs_pools_cache

        def get_mount_points():
            nonlocal _mount_points_cache
            if _mount_points_cache is None:
                try:
                    import psutil
                    _mount_points_cache = set(p.mountpoint for p in psutil.disk_partitions(all=True))
                except Exception:
                    _mount_points_cache = set()
            return _mount_points_cache

        def get_pve_services_status():
            nonlocal _pve_services_cache
            if _pve_services_cache is None:
                _pve_services_cache = {}
                try:
                    result = subprocess.run(
                        ['systemctl', 'list-units', '--type=service', '--all', '--no-legend'],
                        capture_output=True, text=True, timeout=10
                    )
                    if result.returncode == 0:
                        for line in result.stdout.strip().split('\n'):
                            parts = line.split()
                            if parts:
                                service_name = parts[0].replace('.service', '')
                                _pve_services_cache[service_name] = 'active' in line
                except Exception:
                    pass
            return _pve_services_cache

        def extract_vmid_from_text(text):
            """Extract VM/CT ID from error message or key."""
            if not text:
                return None
            # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", "VM/CT 100", "qemu/100", "lxc/100", etc.
            patterns = [
                r'(?:VM|CT|VMID|CTID|vm_|ct_|vmct_)[\s_]?(\d{3,})',  # VM 100, ct_100
                r'VM/CT[\s_]?(\d{3,})',                               # VM/CT 100
                r'(?:qemu|lxc)[/\\](\d{3,})',                         # qemu/100, lxc/100
                r'process.*kvm.*?(\d{3,})',                           # process kvm with vmid
                r'Failed to start.*?(\d{3,})',                        # Failed to start VM/CT
                r'starting.*?(\d{3,}).*failed',                       # starting 100 failed
            ]
            for pattern in patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    return match.group(1)
            return None

        def get_age_hours(timestamp_str):
            """Get age in hours from ISO timestamp string."""
            if not timestamp_str:
                return 0
            try:
                dt = datetime.fromisoformat(timestamp_str)
                return (now - dt).total_seconds() / 3600
            except (ValueError, TypeError):
                return 0

        for error_row in active_errors:
            err_id, error_key, category, reason, first_seen, last_seen, severity = error_row
            should_resolve = False
            resolution_reason = None
            age_hours = get_age_hours(first_seen)
            last_seen_hours = get_age_hours(last_seen)

            # === VM/CT ERRORS ===
            # Only attempt VMID resolution when the error context is actually VM/CT-related.
            # The loose regex patterns in extract_vmid_from_text (kvm/Failed to start/starting...failed)
            # otherwise match any 3+ digit number in unrelated disk/network/service messages, and the
            # if/elif chain below would short-circuit the legitimate category-specific check.
            is_vm_ct_context = (
                category in ('vms', 'vmct') or
                (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_')))
            )
            vmid = None
            if is_vm_ct_context:
                vmid_from_key = extract_vmid_from_text(error_key) if error_key else None
                vmid_from_reason = extract_vmid_from_text(reason) if reason else None
                vmid = vmid_from_key or vmid_from_reason

            if is_vm_ct_context and vmid and not check_vm_ct_cached(vmid):
                should_resolve = True
                resolution_reason = f'VM/CT {vmid} deleted'
            elif is_vm_ct_context:
                # VM/CT context but ID couldn't be extracted - resolve if stale
                if not vmid and last_seen_hours > 1:
                    should_resolve = True
                    resolution_reason = 'VM/CT error stale (>1h, ID not found)'

            # === DISK ERRORS ===
            # Check if disk device or ZFS pool still exists
            elif category == 'disks' or category == 'storage':
                if error_key:
                    # Check for ZFS pool errors (e.g., "zfs_pool_rpool_degraded")
                    zfs_match = re.search(r'zfs_(?:pool_)?([a-zA-Z0-9_-]+)', error_key)
                    if zfs_match:
                        pool_name = zfs_match.group(1)
                        pools = get_zfs_pools()
                        if pools and pool_name not in pools:
                            should_resolve = True
                            resolution_reason = 'ZFS pool removed'

                    # Check for disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing", "disk_fs_sdb1")
                    if not should_resolve:
                        # Match patterns like: smart_sda, disk_sdb, io_error_nvme0n1, disk_fs_sdb1
                        disk_match = re.search(r'(?:disk_fs_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
                        if disk_match:
                            disk_name = disk_match.group(1)
                            # Remove partition number for base device check
                            base_disk = disk_base_name(disk_name)
                            disk_path = f'/dev/{disk_name}'
                            base_path = f'/dev/{base_disk}'
                            if not os.path.exists(disk_path) and not os.path.exists(base_path):
                                should_resolve = True
                                resolution_reason = 'Disk device removed'

                    # Check for mount point errors (e.g., "disk_fs_/mnt/data")
                    if not should_resolve and 'disk_fs_' in error_key:
                        mount = error_key.replace('disk_fs_', '').split('_')[0]
                        if mount.startswith('/'):
                            mounts = get_mount_points()
                            if mounts and mount not in mounts:
                                should_resolve = True
                                resolution_reason = 'Mount point removed'

            # === NETWORK ERRORS ===
            # Check if network interface still exists
            elif category == 'network':
                if error_key:
                    # Extract interface name (e.g., "net_vmbr1_down" -> "vmbr1", "bond0_slave_error" -> "bond0")
                    iface_match = re.search(r'(?:net_|bond_|vmbr|eth|eno|ens|enp)([a-zA-Z0-9_]+)?', error_key)
                    if iface_match:
                        # Reconstruct full interface name
                        full_match = re.search(r'((?:vmbr|bond|eth|eno|ens|enp)[a-zA-Z0-9]+)', error_key)
                        if full_match:
                            iface = full_match.group(1)
                            interfaces = get_network_interfaces()
                            if interfaces and iface not in interfaces:
                                should_resolve = True
                                resolution_reason = 'Network interface removed'

            # === SERVICE ERRORS ===
            # Check if service exists or if it references a deleted CT
            elif category in ('services', 'pve_services'):
                # First check if it references a CT that no longer exists
                vmid = extract_vmid_from_text(reason) or extract_vmid_from_text(error_key)
                if vmid and not check_vm_ct_cached(vmid):
                    should_resolve = True
                    resolution_reason = 'Container deleted'

                # For pve_services, check if the service unit exists
                if not should_resolve and category == 'pve_services' and error_key:
                    service_match = re.search(r'service_([a-zA-Z0-9_-]+)', error_key)
                    if service_match:
                        service_name = service_match.group(1)
                        services = get_pve_services_status()
                        if services and service_name not in services:
                            should_resolve = True
                            resolution_reason = 'Service no longer exists'

            # === LOG ERRORS ===
            # Auto-resolve log errors after 48h (they represent point-in-time issues)
            elif category == 'logs' or (error_key and error_key.startswith(('log_persistent_', 'log_spike_', 'log_cascade_', 'log_critical_'))):
                if age_hours > 48:
                    should_resolve = True
                    resolution_reason = 'Log error aged out (>48h)'

            # === CLUSTER ERRORS ===
            # Resolve cluster/corosync/qdevice errors if node is no longer in a cluster
            # Check both error_key and reason for cluster-related keywords
            cluster_keywords = ('cluster', 'corosync', 'qdevice', 'quorum', 'cman', 'pacemaker')
            is_cluster_error = (
                (error_key and any(x in error_key.lower() for x in cluster_keywords)) or
                (reason and any(x in reason.lower() for x in cluster_keywords))
            )
            if is_cluster_error:
                cluster_info = get_cluster_status()
                if not cluster_info['is_cluster']:
                    should_resolve = True
                    resolution_reason = 'No longer in cluster'

            # === TEMPERATURE ERRORS ===
            # Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity)
            elif category == 'temperature':
                if last_seen_hours > 24:
                    should_resolve = True
                    resolution_reason = 'Temperature error stale (>24h no activity)'

            # === UPDATES/SECURITY ERRORS ===
            # These are informational - auto-resolve after 7 days if acknowledged or stale
            elif category in ('updates', 'security'):
                if age_hours > 168:  # 7 days
                    should_resolve = True
                    resolution_reason = 'Update/security notice aged out (>7d)'

            # === FALLBACK: ANY STALE ERROR ===
            # Any error that hasn't been seen in 7 days and is older than 7 days
            if not should_resolve and age_hours > 168 and last_seen_hours > 168:
                should_resolve = True
                resolution_reason = 'Stale error (no activity >7d)'

            if should_resolve:
                cursor.execute('''
                    UPDATE errors SET resolved_at = ?, resolution_type = 'auto', resolution_reason = ?
                    WHERE id = ?
                ''', (now_iso, resolution_reason, err_id))
                resolved_count += 1

        if resolved_count > 0:
            conn.commit()
            print(f"[HealthPersistence] Auto-resolved {resolved_count} errors for stale/deleted resources")

        conn.close()

    def _check_vm_ct_exists(self, vmid: str) -> bool:
        """Report whether a guest (VM or container) with this ID is defined.

        Existence is probed with ``qm config`` and then ``pct config``. The
        original author notes that both succeed for stopped guests and fail
        only when the ID is unknown.

        Returns:
            True when either command exits with status 0, or when a probe
            times out (treated as "exists" so errors are not cleared by
            mistake). False when both probes exit non-zero or when a probe
            cannot be run at all (e.g. the command is missing), which lets
            the caller clean up.
        """
        import subprocess

        try:
            # VM tool first, then the container tool; first success wins.
            for tool in ('qm', 'pct'):
                probe = subprocess.run(
                    [tool, 'config', vmid],
                    capture_output=True,
                    text=True,
                    timeout=3,
                )
                if probe.returncode == 0:
                    return True
        except subprocess.TimeoutExpired:
            # Could not decide in time: err on the side of "still exists".
            return True
        except Exception:
            # Anything else (command not found, ...) counts as "gone".
            return False
        return False

    def check_vm_running(self, vm_id: str) -> bool:
        """
        Check whether a VM/CT is running and clear its TRANSIENT errors if so.

        The guest is looked up with ``qm status`` and, if that fails, with
        ``pct status``:

        - Neither command succeeds: the guest is treated as deleted and the
          ``vm_<id>``, ``ct_<id>`` and ``vmct_<id>`` errors are resolved.
        - The guest is running: active errors stored under ``<type>_<id>`` and
          ``vmct_<id>`` are resolved unless their reason looks like a
          persistent configuration problem (device missing, permissions,
          "not found", ...), which a restart would not fix.
        - The guest exists but is not running: nothing is resolved.

        Returns:
            True if the guest is running or was found deleted, False if it is
            stopped or if any step failed (failures are swallowed on purpose:
            this is a best-effort check).
        """
        import subprocess

        # Reason fragments that mark an error as a persistent config problem.
        persistent_indicators = (
            'device', 'missing', 'does not exist', 'permission',
            'not found', 'no such', 'invalid',
        )

        try:
            guest_type = None
            is_running = False

            # VMs first, containers second; stop at the first tool that knows the ID.
            for tool, kind in (('qm', 'vm'), ('pct', 'ct')):
                status = subprocess.run(
                    [tool, 'status', vm_id],
                    capture_output=True,
                    text=True,
                    timeout=2
                )
                if status.returncode == 0:
                    guest_type = kind
                    is_running = 'running' in status.stdout.lower()
                    break

            # If neither VM nor CT exists, resolve ALL related errors
            if guest_type is None:
                for prefix in ('vm', 'ct', 'vmct'):
                    self.resolve_error(f'{prefix}_{vm_id}', 'VM/CT deleted')
                return True

            if not is_running:
                return False

            # Decide which errors are transient while holding the connection,
            # then release it BEFORE resolving: resolve_error is called with
            # no connection of ours open, and the handle is closed even if a
            # query raises.
            transient_keys = []
            conn = self._get_conn()
            try:
                cursor = conn.cursor()
                for error_key in (f'{guest_type}_{vm_id}', f'vmct_{vm_id}'):
                    cursor.execute('''
                        SELECT error_key, reason FROM errors
                        WHERE error_key = ? AND resolved_at IS NULL
                    ''', (error_key,))
                    row = cursor.fetchone()
                    if row is None:
                        continue
                    reason = (row[1] or '').lower()
                    if not any(marker in reason for marker in persistent_indicators):
                        transient_keys.append(error_key)
            finally:
                conn.close()

            for error_key in transient_keys:
                self.resolve_error(error_key, f'{guest_type.upper()} started successfully')
            return True

        except Exception:
            # Best-effort: any failure (tool missing, timeout, DB error) means
            # "could not confirm", which callers treat as not running.
            return False

    def get_dismissed_errors(self) -> List[Dict[str, Any]]:
        """
        Get errors that were acknowledged/dismissed but are still within
        their suppression period.

        Every row with ``acknowledged = 1`` is loaded, newest acknowledgement
        first. A row is kept when its ``suppression_hours`` is -1 (permanent
        dismiss) or when fewer than ``suppression_hours`` hours (default
        ``DEFAULT_SUPPRESSION_HOURS`` when NULL) have elapsed since
        ``acknowledged_at`` (``resolved_at`` is the fallback reference). Rows
        without a usable timestamp are skipped.

        Returns:
            One dict per kept row with the table columns (``details`` decoded
            from JSON when it parses) plus ``dismissed`` (True), ``permanent``
            and ``suppression_remaining_hours`` (-1 when permanent). Per the
            original author, the frontend shows these as INFO with a
            'Dismissed' badge.
        """
        conn = self._get_conn()
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute('''
                SELECT * FROM errors
                WHERE acknowledged = 1
                ORDER BY acknowledged_at DESC
            ''')
            rows = cursor.fetchall()
        finally:
            # Release the handle even when the query itself fails.
            conn.close()

        dismissed = []
        now = datetime.now()

        for row in rows:
            error_dict = dict(row)
            if error_dict.get('details'):
                try:
                    error_dict['details'] = json.loads(error_dict['details'])
                except (json.JSONDecodeError, TypeError):
                    pass  # keep the raw text when it is not valid JSON

            # Use acknowledged_at as reference (resolved_at may be NULL for
            # dismissed but still-active errors).
            try:
                ref_time_str = error_dict.get('acknowledged_at') or error_dict.get('resolved_at')
                if not ref_time_str:
                    continue
                ref_dt = datetime.fromisoformat(ref_time_str)
                sup_hours = error_dict.get('suppression_hours')
                if sup_hours is None:
                    sup_hours = self.DEFAULT_SUPPRESSION_HOURS

                error_dict['dismissed'] = True

                if sup_hours == -1:
                    # Permanent dismiss
                    error_dict['suppression_remaining_hours'] = -1
                    error_dict['permanent'] = True
                    dismissed.append(error_dict)
                else:
                    elapsed_seconds = (now - ref_dt).total_seconds()
                    suppression_seconds = sup_hours * 3600

                    if elapsed_seconds < suppression_seconds:
                        error_dict['suppression_remaining_hours'] = round(
                            (suppression_seconds - elapsed_seconds) / 3600, 1
                        )
                        error_dict['permanent'] = False
                        dismissed.append(error_dict)
            except (ValueError, TypeError):
                # Malformed timestamp or hours value: leave this row out.
                pass

        return dismissed

    def emit_event(self, event_type: str, category: str, severity: str,
                   data: Optional[Dict] = None) -> int:
        """
        Emit a health event for the notification system.

        The event is stored in ``events`` with ``error_key`` set to
        ``<category>_<severity>`` and a JSON payload made of *data* plus the
        keys ``category``, ``severity`` and ``needs_notification`` (True).
        The caller's *data* dict is copied, never modified.

        Event types:
        - 'state_change': severity changed (OK->WARNING, WARNING->CRITICAL, etc.)
        - 'new_error': new error detected
        - 'resolved': error resolved
        - 'escalated': severity increased

        Returns:
            The row id of the inserted event.
        """
        # Copy first: writing into the caller's dict would leak our
        # bookkeeping keys into whatever the caller does with it next.
        event_data = dict(data) if data else {}
        event_data['category'] = category
        event_data['severity'] = severity
        event_data['needs_notification'] = True

        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO events (event_type, error_key, timestamp, data)
                VALUES (?, ?, ?, ?)
            ''', (event_type, f'{category}_{severity}', datetime.now().isoformat(),
                  json.dumps(event_data)))
            event_id = cursor.lastrowid
            conn.commit()
        finally:
            # Release the handle even when the insert fails.
            conn.close()
        return event_id

    def get_pending_notifications(self) -> List[Dict[str, Any]]:
        """
        Get events that still need a notification (for future
        Telegram/Gotify integration).

        Selects up to 100 events whose JSON ``data`` has
        ``needs_notification`` set, newest first, joined (LEFT JOIN on
        ``error_key``) with the matching error's category and reason.

        Returns:
            A flat list of dicts, one per event row, with the event columns
            plus ``error_category`` and ``error_reason`` (None when no error
            matches). ``data`` is decoded from JSON when it parses. No
            grouping is applied here; callers batch as they see fit.
        """
        conn = self._get_conn()
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute('''
                SELECT e.*, err.category as error_category, err.reason as error_reason
                FROM events e
                LEFT JOIN errors err ON e.error_key = err.error_key
                WHERE json_extract(e.data, '$.needs_notification') = 1
                ORDER BY e.timestamp DESC
                LIMIT 100
            ''')
            rows = cursor.fetchall()
        finally:
            # Release the handle even when the query itself fails.
            conn.close()

        events = []
        for row in rows:
            event_dict = dict(row)
            if event_dict.get('data'):
                try:
                    event_dict['data'] = json.loads(event_dict['data'])
                except (json.JSONDecodeError, TypeError):
                    pass  # keep the raw text when it is not valid JSON
            events.append(event_dict)

        return events

    def mark_events_notified(self, event_ids: List[int]):
        """Mark events as notified (notification was sent successfully).

        For every listed event id, ``data.needs_notification`` is set to 0
        and ``data.notified_at`` to the current local time. A NULL ``data``
        column is treated as an empty JSON object. An empty or missing id
        collection is a no-op.

        Args:
            event_ids: ids of rows in ``events``; any iterable of ints is
                accepted (list, tuple, set, generator).
        """
        # Materialise once so tuples/sets/generators work like lists.
        ids = list(event_ids or ())
        if not ids:
            return

        # Single UPDATE with an IN clause instead of N individual updates.
        now = datetime.now().isoformat()
        placeholders = ','.join('?' * len(ids))

        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute(f'''
                UPDATE events
                SET data = json_set(COALESCE(data, '{{}}'), '$.needs_notification', 0, '$.notified_at', ?)
                WHERE id IN ({placeholders})
            ''', [now, *ids])
            conn.commit()
        finally:
            # Release the handle even when the update fails.
            conn.close()

    def _record_event(self, cursor, event_type: str, error_key: str, data: Dict):
        """Internal: append one row to ``events`` through the caller's cursor.

        The row is stamped with the current local time and *data* is stored
        as JSON text. Nothing is committed here; the transaction belongs to
        whoever owns *cursor*.
        """
        stamp = datetime.now().isoformat()
        payload = json.dumps(data)
        cursor.execute('''
            INSERT INTO events (event_type, error_key, timestamp, data)
            VALUES (?, ?, ?, ?)
        ''', (event_type, error_key, stamp, payload))

    def get_unnotified_errors(self) -> List[Dict[str, Any]]:
        """Get errors that still need a Telegram notification.

        Selects rows from ``errors`` that are unresolved, unacknowledged and
        have ``notification_sent = 0``, ordered by ``severity`` descending
        and then ``first_seen`` ascending.

        Returns:
            One dict per row. ``details`` is decoded from JSON when it
            parses; a malformed value is left as the stored text so that one
            bad row cannot block notifications for all the others.
        """
        conn = self._get_conn()
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            cursor.execute('''
                SELECT * FROM errors
                WHERE notification_sent = 0
                  AND resolved_at IS NULL
                  AND acknowledged = 0
                ORDER BY severity DESC, first_seen ASC
            ''')
            rows = cursor.fetchall()
        finally:
            # Release the handle even when the query itself fails.
            conn.close()

        errors = []
        for row in rows:
            error_dict = dict(row)
            if error_dict.get('details'):
                try:
                    error_dict['details'] = json.loads(error_dict['details'])
                except (json.JSONDecodeError, TypeError):
                    pass  # same tolerance as the other readers of this column
            errors.append(error_dict)

        return errors

    def mark_notified(self, error_key: str):
        """Mark an error as notified.

        Sets ``notification_sent = 1`` on every ``errors`` row whose
        ``error_key`` matches; an unknown key updates nothing.
        """
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE errors
                SET notification_sent = 1
                WHERE error_key = ?
            ''', (error_key,))
            conn.commit()
        finally:
            # Release the handle even when the update fails.
            conn.close()

    # ─── System Capabilities Cache ───────────────────────────────

    def get_capability(self, cap_key: str) -> Optional[str]:
        """
        Get a cached system capability value.

        Returns:
            The ``cap_value`` stored for *cap_key* in ``system_capabilities``,
            or None if no row exists for that key.
        """
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute(
                'SELECT cap_value FROM system_capabilities WHERE cap_key = ?',
                (cap_key,)
            )
            row = cursor.fetchone()
        finally:
            # Release the handle even when the query fails.
            conn.close()
        return row[0] if row else None

    def set_capability(self, cap_key: str, cap_value: str):
        """Store a system capability value.

        Inserts or replaces the ``system_capabilities`` row for *cap_key*,
        stamping ``detected_at`` with the current local time. Per the
        original author the value is meant to be detected once and then
        served from this cache.
        """
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO system_capabilities (cap_key, cap_value, detected_at)
                VALUES (?, ?, ?)
            ''', (cap_key, cap_value, datetime.now().isoformat()))
            conn.commit()
        finally:
            # Release the handle even when the write fails.
            conn.close()

    def get_all_capabilities(self) -> Dict[str, str]:
        """Get all cached system capabilities as a ``{cap_key: cap_value}`` dict."""
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT cap_key, cap_value FROM system_capabilities')
            rows = cursor.fetchall()
        finally:
            # Release the handle even when the query fails.
            conn.close()
        return {row[0]: row[1] for row in rows}

    # Note: System capabilities (has_zfs, has_lvm) are now derived at runtime
    # from Proxmox storage types in health_monitor.get_detailed_status()
    # This avoids redundant subprocess calls and ensures immediate detection
    # when the user adds new ZFS/LVM storage via Proxmox.

    # ─── User Settings ──────────────────────────────────────────

    def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """Look up one user setting, returning *default* when it is not stored.

        Opens a managed connection and delegates the query to
        ``_get_setting_impl``.
        """
        with self._db_connection() as db:
            value = self._get_setting_impl(db, key, default)
        return value

    def _get_setting_impl(self, conn, key: str, default: Optional[str] = None) -> Optional[str]:
        """Internal: read one setting through a connection the caller owns.

        Taking *conn* as a parameter lets callers that already hold a
        connection avoid opening a nested one (the "P4 fix" noted by the
        original author). The connection is neither committed nor closed.

        Returns:
            The stored ``setting_value`` for *key*, or *default* when no row
            exists.
        """
        lookup = conn.cursor()
        lookup.execute(
            'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
        )
        match = lookup.fetchone()
        if not match:
            return default
        return match[0]

    def set_setting(self, key: str, value: str):
        """Store a user setting value.

        Inserts or replaces the ``user_settings`` row for *key* and stamps
        ``updated_at`` with the current local time. The write runs under
        ``_db_lock``.
        """
        with self._db_lock:
            conn = self._get_conn()
            try:
                cursor = conn.cursor()
                cursor.execute('''
                    INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                    VALUES (?, ?, ?)
                ''', (key, value, datetime.now().isoformat()))
                conn.commit()
            finally:
                # Release the handle even when the write fails, so a failed
                # save does not leave an open connection behind.
                conn.close()

    def get_all_settings(self, prefix: Optional[str] = None) -> Dict[str, str]:
        """Get all user settings, optionally filtered by key prefix.

        Args:
            prefix: when given and non-empty, only keys starting with this
                literal text are returned. ``%`` and ``_`` in the prefix are
                matched literally, not as LIKE wildcards (so 'suppress_'
                does not match 'suppressXcpu'). Matching still goes through
                SQLite's LIKE, which ignores ASCII case by default.

        Returns:
            A ``{setting_key: setting_value}`` dict.
        """
        conn = self._get_conn()
        try:
            cursor = conn.cursor()
            if prefix:
                # Escape LIKE metacharacters so the prefix is taken verbatim.
                literal = (prefix.replace('\\', '\\\\')
                                 .replace('%', '\\%')
                                 .replace('_', '\\_'))
                cursor.execute(
                    "SELECT setting_key, setting_value FROM user_settings "
                    "WHERE setting_key LIKE ? ESCAPE '\\'",
                    (f'{literal}%',)
                )
            else:
                cursor.execute('SELECT setting_key, setting_value FROM user_settings')
            rows = cursor.fetchall()
        finally:
            # Release the handle even when the query fails.
            conn.close()
        return {row[0]: row[1] for row in rows}

    def sync_dismissed_suppression(self):
        """
        Retroactively update all existing dismissed errors to match current
        user settings. Called when the user saves settings, so changes are
        effective immediately on already-dismissed items.

        For each acknowledged error, the suppression setting of its category
        is looked up (``DEFAULT_SUPPRESSION_HOURS`` when the setting is unset
        or empty). When that differs from the row's ``suppression_hours`` the
        row is updated and a 'suppression_updated' event is recorded. Errors
        whose category has no suppression setting are left untouched.

        Returns:
            The number of error rows that were updated.
        """
        # Resolve configuration up front, before opening our own connection,
        # so no second connection is opened while this one is in use.
        category_to_setting = {}
        for setting_key, meta in self._get_category_labels().items():
            # setdefault keeps the first key listed for a category.
            category_to_setting.setdefault(meta['category'], setting_key)
        current_settings = self.get_all_settings('suppress_')

        updated_count = 0
        conn = self._get_conn()
        try:
            cursor = conn.cursor()

            # Get all dismissed (acknowledged) errors
            cursor.execute('''
                SELECT id, error_key, category, suppression_hours
                FROM errors WHERE acknowledged = 1
            ''')
            dismissed = cursor.fetchall()

            for err_id, error_key, category, old_hours in dismissed:
                setting_key = category_to_setting.get(category)
                if not setting_key:
                    continue

                stored = current_settings.get(setting_key)
                new_hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS
                if new_hours == old_hours:
                    continue

                cursor.execute(
                    'UPDATE errors SET suppression_hours = ? WHERE id = ?',
                    (new_hours, err_id)
                )
                self._record_event(cursor, 'suppression_updated', error_key, {
                    'old_hours': old_hours,
                    'new_hours': new_hours,
                    'reason': 'settings_sync'
                })
                updated_count += 1

            conn.commit()
        finally:
            # Closing without commit discards a half-applied sync on error.
            conn.close()
        return updated_count

    def _get_category_labels(self) -> dict:
        """Internal helper: suppression setting key -> display metadata.

        Each value is a dict with the human-readable ``label``, the health
        ``category`` the setting applies to and the ``icon`` name.
        """
        # (setting key, label, category, icon) - order is the returned order.
        table = (
            ('suppress_cpu', 'CPU Usage & Temperature', 'temperature', 'cpu'),
            ('suppress_memory', 'Memory & Swap', 'memory', 'memory'),
            ('suppress_storage', 'Storage Mounts & Space', 'storage', 'storage'),
            ('suppress_disks', 'Disk I/O & Errors', 'disks', 'disk'),
            ('suppress_network', 'Network Interfaces', 'network', 'network'),
            ('suppress_vms', 'VMs & Containers', 'vms', 'vms'),
            ('suppress_pve_services', 'PVE Services', 'pve_services', 'services'),
            ('suppress_logs', 'System Logs', 'logs', 'logs'),
            ('suppress_updates', 'System Updates', 'updates', 'updates'),
            ('suppress_security', 'Security & Certificates', 'security', 'security'),
        )
        return {
            setting_key: {'label': label, 'category': category, 'icon': icon}
            for setting_key, label, category, icon in table
        }

    def get_suppression_categories(self) -> List[Dict[str, Any]]:
        """
        List every health category together with its suppression hours.

        The hours come from the stored ``suppress_*`` setting when one is set
        (parsed with ``int``), otherwise from ``DEFAULT_SUPPRESSION_HOURS``.
        Per the original author, the settings page renders its per-category
        configuration from this list.
        """
        labels = self._get_category_labels()
        overrides = self.get_all_settings('suppress_')

        def describe(setting_key, meta):
            # Resolve the hours first, then assemble the entry.
            raw = overrides.get(setting_key)
            if raw:
                hours = int(raw)
            else:
                hours = self.DEFAULT_SUPPRESSION_HOURS
            return {
                'key': setting_key,
                'label': meta['label'],
                'category': meta['category'],
                'icon': meta['icon'],
                'hours': hours,
            }

        return [describe(setting_key, meta) for setting_key, meta in labels.items()]

    def get_custom_suppressions(self) -> List[Dict[str, Any]]:
        """
        Return the suppression categories whose hours differ from the default.

        Entries come from ``get_suppression_categories``; those equal to
        ``DEFAULT_SUPPRESSION_HOURS`` are dropped. Per the original author,
        the health modal uses this for its summary of custom suppressions.
        """
        customised = []
        for entry in self.get_suppression_categories():
            if entry['hours'] != self.DEFAULT_SUPPRESSION_HOURS:
                customised.append(entry)
        return customised

    def record_unknown_persistent(self, category: str, reason: str):
        """
        Log a persistent UNKNOWN event for a health category.

        Thread-safe entry point: takes ``_db_lock`` and hands the work to
        ``_record_unknown_persistent_impl``. Per the original author it is
        meant for checks that could not be verified for >= 3 consecutive
        cycles (~15 min); the implementation records at most one event per
        category every 30 minutes.
        """
        db_lock = self._db_lock
        with db_lock:
            self._record_unknown_persistent_impl(category, reason)

    def _record_unknown_persistent_impl(self, category, reason):
        """Internal: insert an 'unknown_persistent' event unless one is recent.

        Events are keyed ``unknown_persistent_<category>``. If the newest
        event with that key is less than 30 minutes old, nothing is written.
        A malformed stored timestamp is ignored and a new event is recorded.

        Any failure is printed and swallowed; the connection is always
        closed, including on the early-return and error paths.
        """
        conn = None
        try:
            event_key = f'unknown_persistent_{category}'
            now = datetime.now()

            conn = self._get_conn()
            cursor = conn.cursor()

            # Check if we already recorded this within the last 30 minutes.
            # The events table has columns (id, event_type, error_key,
            # timestamp, data); error_key carries the category, so it is the
            # deduplication key.
            cursor.execute('''
                SELECT MAX(timestamp) FROM events
                WHERE event_type = ? AND error_key = ?
            ''', ('unknown_persistent', event_key))
            row = cursor.fetchone()
            if row and row[0]:
                try:
                    last_recorded = datetime.fromisoformat(row[0])
                    if (now - last_recorded).total_seconds() < 1800:
                        return  # Already recorded recently
                except (ValueError, TypeError):
                    pass  # If timestamp is malformed, proceed with recording

            cursor.execute('''
                INSERT INTO events (event_type, error_key, timestamp, data)
                VALUES (?, ?, ?, ?)
            ''', ('unknown_persistent', event_key, now.isoformat(),
                  json.dumps({'category': category, 'reason': reason})))

            conn.commit()
        except Exception as e:
            print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
        finally:
            if conn is not None:
                conn.close()


    # ────────────────────────────────────────────────────────────────
    #  Disk Observations API
    # ────────────────────────────────────────────────────────────────

    def register_disk(self, device_name: str, serial: Optional[str] = None,
                      model: Optional[str] = None, size_bytes: Optional[int] = None):
        """Insert or refresh a physical disk's row in ``disk_registry``.

        Rows are keyed on (device_name, serial). An existing row gets its
        model and size refreshed (when provided), ``last_seen`` bumped and
        ``removed`` reset to 0, which re-activates a previously removed disk.

        With a serial: rows that hold the same serial under an ATA port name
        (e.g. 'ata8') are folded into the block-device entry (e.g. 'sdh').
        If the target (device_name, serial) row already exists, the old
        row's observations are re-pointed to it and the old row is deleted;
        otherwise the old row is renamed in place.

        Without a serial: the most recently seen non-empty serial already
        stored for ``device_name`` is reused, so no second, serial-less row
        is created for the same device.

        Runs under ``_db_lock``. Failures are printed and swallowed.
        """
        with self._db_lock:
            seen_at = datetime.now().isoformat()
            try:
                # Managed connection: per the original author's note, this is
                # what releases the SQLite handle when a statement below
                # fails, instead of leaving it orphaned and blocking writers.
                with self._db_connection() as conn:
                    cursor = conn.cursor()

                    if serial:
                        effective_serial = serial

                        # Same serial registered under another device name?
                        cursor.execute('''
                            SELECT id, device_name FROM disk_registry
                            WHERE serial = ? AND serial != '' AND device_name != ?
                        ''', (serial, device_name))
                        for stale_id, stale_name in cursor.fetchall():
                            # Only ATA names are folded into block device names.
                            if not stale_name.startswith('ata') or device_name.startswith('ata'):
                                continue

                            cursor.execute(
                                'SELECT id FROM disk_registry WHERE device_name = ? AND serial = ?',
                                (device_name, serial))
                            target = cursor.fetchone()
                            if not target:
                                # No target row yet: rename the old entry to
                                # the real block device name.
                                cursor.execute(
                                    'UPDATE disk_registry SET device_name = ?, model = COALESCE(?, model), '
                                    'size_bytes = COALESCE(?, size_bytes), last_seen = ?, removed = 0 '
                                    'WHERE id = ?',
                                    (device_name, model, size_bytes, seen_at, stale_id))
                                continue

                            # Target exists: move observations over, drop the old row.
                            cursor.execute(
                                'UPDATE disk_observations SET disk_registry_id = ? WHERE disk_registry_id = ?',
                                (target[0], stale_id))
                            cursor.execute('DELETE FROM disk_registry WHERE id = ?', (stale_id,))
                    else:
                        # No serial supplied: borrow the one already on file
                        # for this device, if any, to avoid a duplicate row.
                        effective_serial = ''
                        cursor.execute('''
                            SELECT serial FROM disk_registry
                            WHERE device_name = ? AND serial != ''
                            ORDER BY last_seen DESC LIMIT 1
                        ''', (device_name,))
                        known = cursor.fetchone()
                        if known and known[0]:
                            effective_serial = known[0]

                    upsert_args = (device_name, effective_serial, model, size_bytes, seen_at, seen_at)
                    cursor.execute('''
                        INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
                        VALUES (?, ?, ?, ?, ?, ?, 0)
                        ON CONFLICT(device_name, serial) DO UPDATE SET
                            model = COALESCE(excluded.model, model),
                            size_bytes = COALESCE(excluded.size_bytes, size_bytes),
                            last_seen = excluded.last_seen,
                            removed = 0
                    ''', upsert_args)

                    conn.commit()
            except Exception as e:
                print(f"[HealthPersistence] Error registering disk {device_name}: {e}")

    def _get_disk_registry_id(self, cursor, device_name: str,
                               serial: Optional[str] = None,
                               prefer_with_observations: bool = True) -> Optional[int]:
        """Find disk_registry.id, matching by serial first, then device_name.

        Also handles ATA-to-block cross-references: if looking for 'sdh' also
        checks entries with ATA names that share the same serial.

        When prefer_with_observations=True, prioritizes records that have
        linked observations, which helps with USB disks that may have
        multiple registry entries (one with serial, one without).
        """
        clean_dev = device_name.replace('/dev/', '')

        if serial:
            cursor.execute(
                'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
                (serial,))
            row = cursor.fetchone()
            if row:
                return row[0]
        else:
            # No serial provided - first check if a record WITH serial exists for this device
            # This prevents returning a duplicate record without serial
            cursor.execute('''
                SELECT id FROM disk_registry
                WHERE device_name = ? AND serial != ''
                ORDER BY last_seen DESC LIMIT 1
            ''', (clean_dev,))
            row = cursor.fetchone()
            if row:
                return row[0]

        # Fallback: match by device_name

        if prefer_with_observations:
            # First try to find a registry entry that has observations linked
            # This handles USB disks where errors may be recorded under a different
            # registry entry (e.g., one without serial)
            cursor.execute('''
                SELECT dr.id FROM disk_registry dr
                LEFT JOIN disk_observations do ON dr.id = do.disk_registry_id
                WHERE dr.device_name = ?
                GROUP BY dr.id
                ORDER BY COUNT(do.id) DESC, dr.last_seen DESC
                LIMIT 1
            ''', (clean_dev,))
            row = cursor.fetchone()
            if row:
                return row[0]
        else:
            cursor.execute(
                'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
                (clean_dev,))
            row = cursor.fetchone()
            if row:
                return row[0]

        # Last resort: search for ATA-named entries that might refer to this device
        # This handles cases where observations were recorded under 'ata8'
        # but we're querying for 'sdh'
        if clean_dev.startswith('sd') or clean_dev.startswith('nvme'):
            cursor.execute(
                'SELECT id FROM disk_registry WHERE device_name LIKE "ata%" ORDER BY last_seen DESC')
            # For each ATA entry, we can't resolve here without OS access,
            # so just return None and let the serial-based consolidation
            # in register_disk handle it over time.
            pass
        return None

    # NOTE: update_disk_worst_health, get_disk_health_status, clear_disk_health_history
    # were removed. The disk health badge now shows the CURRENT status from Proxmox/SMART
    # directly, not a persistent "worst_health". Historical observations are preserved
    # in disk_observations table and shown separately via the "X obs." badge.

    def record_disk_observation(self, device_name: str, serial: Optional[str],
                                 error_type: str, error_signature: str,
                                 raw_message: str = '',
                                 severity: str = 'warning'):
        """Record a disk error observation, deduplicating repeats.

        Args:
            device_name: Device the error was seen on.
            serial: Disk serial number, or None when unknown.
            error_type: e.g. 'smart_error', 'io_error', 'connection_error'.
            error_signature: Normalized unique string used for dedup
                (e.g. 'FailedReadSmartSelfTestLog').
            raw_message: Original log text for the observation.
            severity: Severity label stored with the observation.

        The actual work happens in `_record_disk_observation_locked`, which
        is invoked while holding `_db_lock`: the method does PRAGMA
        introspection + UPSERT on one connection and is called from
        journal/polling/webhook threads concurrently, so without
        serialization two threads could race and produce duplicate rows for
        the same (disk, type, signature). Audit Tier 5 (Health stack).

        Any exception is logged and swallowed; the method always returns None.
        """
        observed_at = datetime.now().isoformat()
        call_args = (device_name, serial, error_type, error_signature,
                     raw_message, severity, observed_at)
        try:
            with self._db_lock:
                self._record_disk_observation_locked(*call_args)
        except Exception as e:
            print(f"[HealthPersistence] Error recording disk observation: {e}")

    def _record_disk_observation_locked(self, device_name, serial, error_type,
                                         error_signature, raw_message, severity, now):
        """Inner body of `record_disk_observation`, called under _db_lock."""
        # Use the context manager so a thrown exception inside any cursor
        # call still releases the SQLite handle. Mirrors the fix on
        # `register_disk` — both are hot-path writes from the dispatch loop.
        try:
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # Auto-register the disk if not present
                clean_dev = device_name.replace('/dev/', '')
                self.register_disk(clean_dev, serial)

                disk_id = self._get_disk_registry_id(cursor, clean_dev, serial)
                if not disk_id:
                    return

                # Detect column names for backward compatibility with older schemas
                cursor.execute('PRAGMA table_info(disk_observations)')
                columns = [col[1] for col in cursor.fetchall()]

                # Map to actual column names (old vs new schema)
                type_col = 'error_type' if 'error_type' in columns else 'observation_type'
                first_col = 'first_occurrence' if 'first_occurrence' in columns else 'first_seen'
                last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'

                # Upsert observation: if same (disk, type, signature), bump count + update last timestamp.
                # IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation,
                # re-detecting the same journal entry must not un-dismiss it. Also do not
                # increment the occurrence_count on dismissed rows (audit Tier 5 — once
                # the user has dismissed, we don't want the counter to keep growing for
                # journal events that no longer interest them; this also stops the badge
                # from drifting upward for dismissed conditions).
                cursor.execute(f'''
                    INSERT INTO disk_observations
                        (disk_registry_id, {type_col}, error_signature, {first_col},
                         {last_col}, occurrence_count, raw_message, severity, dismissed)
                    VALUES (?, ?, ?, ?, ?, 1, ?, ?, 0)
                    ON CONFLICT(disk_registry_id, {type_col}, error_signature) DO UPDATE SET
                        {last_col} = excluded.{last_col},
                        occurrence_count = occurrence_count + 1,
                        severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END
                    WHERE dismissed = 0
                ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))

                conn.commit()
                # Observation recorded - worst_health no longer updated (badge shows current SMART status)

        except Exception as e:
            print(f"[HealthPersistence] Error recording disk observation: {e}")

    def get_disk_observations(self, device_name: Optional[str] = None,
                               serial: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get active (non-dismissed) observations for one disk or all disks.

        When ``device_name`` and/or ``serial`` is given, observations are
        collected from EVERY registry entry matching the device name or the
        serial. USB disks may have several registry entries (one with serial,
        one without), and this finds their observations regardless of which
        entry recorded them. With no filter, all active observations are
        returned.

        Args:
            device_name: Device name, with or without ``/dev/``.
            serial: Disk serial number.

        Returns:
            A list of dicts, newest ``last_occurrence`` first, with keys
            ``id``, ``error_type``, ``error_signature``, ``first_occurrence``,
            ``last_occurrence``, ``occurrence_count``, ``raw_message``,
            ``severity``, ``dismissed``, ``device_name``, ``serial``,
            ``model``. Returns ``[]`` when nothing matches or on any error
            (the error is logged).
        """
        try:
            # The context manager closes the connection even when a query
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # Detect column names for backward compatibility with older schemas
                cursor.execute('PRAGMA table_info(disk_observations)')
                columns = {col[1] for col in cursor.fetchall()}

                type_col = 'error_type' if 'error_type' in columns else 'observation_type'
                first_col = 'first_occurrence' if 'first_occurrence' in columns else 'first_seen'
                last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'

                where_clause = 'o.dismissed = 0'
                params: List[Any] = []

                if device_name or serial:
                    clean_dev = (device_name or '').replace('/dev/', '')

                    # Every registry entry for this device name ...
                    cursor.execute(
                        'SELECT id FROM disk_registry WHERE device_name = ?',
                        (clean_dev,))
                    registry_ids = {row[0] for row in cursor.fetchall()}

                    # ... plus every entry sharing the serial, if one was given.
                    if serial:
                        cursor.execute(
                            "SELECT id FROM disk_registry WHERE serial = ? AND serial != ''",
                            (serial,))
                        registry_ids.update(row[0] for row in cursor.fetchall())

                    if not registry_ids:
                        return []

                    params = list(registry_ids)
                    placeholders = ','.join('?' * len(params))
                    where_clause = (
                        f'o.disk_registry_id IN ({placeholders}) AND o.dismissed = 0')

                cursor.execute(f'''
                    SELECT o.id, o.{type_col}, o.error_signature,
                           o.{first_col}, o.{last_col},
                           o.occurrence_count, o.raw_message, o.severity, o.dismissed,
                           d.device_name, d.serial, d.model
                    FROM disk_observations o
                    JOIN disk_registry d ON o.disk_registry_id = d.id
                    WHERE {where_clause}
                    ORDER BY o.{last_col} DESC
                ''', params)
                rows = cursor.fetchall()

            return [{
                'id': r[0],
                'error_type': r[1],
                'error_signature': r[2],
                'first_occurrence': r[3],
                'last_occurrence': r[4],
                'occurrence_count': r[5],
                'raw_message': r[6] or '',
                'severity': r[7],
                'dismissed': bool(r[8]),
                'device_name': r[9],
                'serial': r[10],
                'model': r[11],
            } for r in rows]
        except Exception as e:
            print(f"[HealthPersistence] Error getting observations: {e}")
            return []

    def get_all_observed_devices(self) -> List[Dict[str, Any]]:
        """List the distinct (device_name, serial) pairs with active observations.

        `device_name` and `serial` are columns of `disk_registry`, not of
        `disk_observations`, so the query joins the two tables. (An earlier
        version selected them from `disk_observations` directly; the
        resulting OperationalError was swallowed and the function always
        returned `[]`.)

        Returns:
            One ``{'device_name': ..., 'serial': ...}`` dict per pair, with a
            NULL serial normalized to ``''``. ``[]`` on error (logged).
        """
        try:
            with self._db_connection() as conn:
                pairs = conn.cursor().execute('''
                    SELECT DISTINCT dr.device_name, dr.serial
                    FROM disk_observations o
                    JOIN disk_registry dr ON o.disk_registry_id = dr.id
                    WHERE o.dismissed = 0
                ''').fetchall()
                devices = []
                for name, serial_value in pairs:
                    devices.append({'device_name': name, 'serial': serial_value or ''})
                return devices
        except Exception as e:
            print(f"[HealthPersistence] get_all_observed_devices failed: {e}")
            return []

    def get_disks_observation_counts(self) -> Dict[str, int]:
        """Return the number of active (non-dismissed) observations per disk.

        Disks WITH a serial are counted per serial, so observations recorded
        under different device names of the same disk (e.g. ata8 -> sdh) are
        consolidated. Each such disk contributes two keys:

        * ``'serial:<serial>'`` -> count, for cross-device matching;
        * its current device name -> count, where "current" is the most
          recently seen registry entry, preferring non-``ata*`` names.

        Disks WITHOUT a serial are counted per device name. When several
        sources map to the same device-name key, the largest count is kept.

        Returns:
            The mapping described above, or ``{}`` on error (logged).
        """
        try:
            # The context manager closes the connection even when a query
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # For disks WITH serial: group by serial to consolidate across device renames
                cursor.execute('''
                    SELECT d.serial, COUNT(o.id) as cnt
                    FROM disk_observations o
                    JOIN disk_registry d ON o.disk_registry_id = d.id
                    WHERE o.dismissed = 0 AND d.serial IS NOT NULL AND d.serial != ''
                    GROUP BY d.serial
                ''')
                serial_counts = {row[0]: row[1] for row in cursor.fetchall()}

                # Current device_name for each serial. Rows arrive best-first
                # (non-ata names, then newest), so the first one per serial wins.
                cursor.execute('''
                    SELECT serial, device_name FROM disk_registry
                    WHERE serial IS NOT NULL AND serial != ''
                    ORDER BY
                        CASE WHEN device_name LIKE 'ata%' THEN 1 ELSE 0 END,
                        last_seen DESC
                ''')
                serial_to_device: Dict[str, str] = {}
                for serial, device_name in cursor.fetchall():
                    serial_to_device.setdefault(serial, device_name)

                result: Dict[str, int] = {}
                for serial, cnt in serial_counts.items():
                    result[f'serial:{serial}'] = cnt
                    device_name = serial_to_device.get(serial)
                    if device_name:
                        result[device_name] = max(result.get(device_name, 0), cnt)

                # For disks WITHOUT serial: group by device_name
                cursor.execute('''
                    SELECT d.device_name, COUNT(o.id) as cnt
                    FROM disk_observations o
                    JOIN disk_registry d ON o.disk_registry_id = d.id
                    WHERE o.dismissed = 0 AND (d.serial IS NULL OR d.serial = '')
                    GROUP BY d.device_name
                ''')
                for device_name, cnt in cursor.fetchall():
                    result[device_name] = max(result.get(device_name, 0), cnt)

                return result
        except Exception as e:
            print(f"[HealthPersistence] Error getting observation counts: {e}")
            return {}

    def dismiss_disk_observation(self, observation_id: int):
        """Mark a single observation as dismissed.

        Args:
            observation_id: ``disk_observations.id`` of the row to dismiss.
                An id that does not exist is a silent no-op.

        Errors are logged and swallowed; the method always returns None.
        """
        try:
            # The context manager closes the connection even when the UPDATE
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                conn.cursor().execute(
                    'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
                    (observation_id,))
                conn.commit()
        except Exception as e:
            print(f"[HealthPersistence] Error dismissing observation: {e}")

    def cleanup_stale_observations(self, max_age_days: int = 30):
        """Auto-dismiss active observations not seen within ``max_age_days``.

        The cutoff is compared against the observation's last-seen column as
        an ISO-8601 string, the same format the timestamps are written in
        (``datetime.now().isoformat()``), so the comparison is lexicographic.

        Args:
            max_age_days: Observations whose last occurrence is older than
                this many days are marked dismissed.

        Errors are logged and swallowed; the method always returns None.
        """
        try:
            cutoff = (datetime.now() - timedelta(days=max_age_days)).isoformat()
            # The context manager closes the connection even when a query
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # Detect column name for backward compatibility with older schemas
                cursor.execute('PRAGMA table_info(disk_observations)')
                columns = {col[1] for col in cursor.fetchall()}
                last_col = 'last_occurrence' if 'last_occurrence' in columns else 'last_seen'

                cursor.execute(f'''
                    UPDATE disk_observations
                    SET dismissed = 1
                    WHERE dismissed = 0 AND {last_col} < ?
                ''', (cutoff,))
                conn.commit()
        except Exception as e:
            print(f"[HealthPersistence] Error cleaning stale observations: {e}")

    def mark_removed_disks(self, active_device_names: List[str]):
        """Mark every registered disk NOT in ``active_device_names`` as removed.

        Only rows currently flagged ``removed = 0`` are updated. Names are
        compared verbatim against ``disk_registry.device_name``.

        An empty (or None) list leaves the registry untouched and does not
        open a database connection. NOTE(review): presumably this keeps a
        failed device scan from flagging every disk as removed; confirm.

        Args:
            active_device_names: Device names currently present. Any
                iterable of strings is accepted.

        Errors are logged and swallowed; the method always returns None.
        """
        try:
            # Materialize once: sqlite3 requires a sequence for parameters
            # (a set or generator would be rejected).
            names = list(active_device_names or ())
            if not names:
                return
            placeholders = ','.join('?' * len(names))
            # The context manager closes the connection even when the UPDATE
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                conn.cursor().execute(f'''
                    UPDATE disk_registry SET removed = 1
                    WHERE device_name NOT IN ({placeholders}) AND removed = 0
                ''', names)
                conn.commit()
        except Exception as e:
            print(f"[HealthPersistence] Error marking removed disks: {e}")

    # Device-name prefixes that `cleanup_orphan_observations` never auto-dismisses.
    # They are used as observation keys for events that don't map to a /dev/<name>
    # entry: ZFS pool names, ATA host identifiers (e.g. "ata8" from
    # "ata8.00: exception ..." journal lines), device-mapper aliases, etc. A
    # presence-in-/dev/ check would always wrongly dismiss those, so any device
    # name starting with one of these prefixes is skipped by the cleanup.
    # The tuple is passed directly to `str.startswith`, so matching is by prefix only.
    # NOTE(review): 'nbd', 'loop' and 'sr' devices normally do have /dev nodes;
    # presumably they are listed to keep virtual/optical devices out of the
    # cleanup as well. Confirm this is intended.
    _LOGICAL_DEVICE_PREFIXES = ('zpool_', 'ata', 'dm-', 'nbd', 'loop', 'sr')

    def cleanup_orphan_observations(self):
        """Dismiss active observations whose device no longer exists in /dev/.

        Useful for cleaning up after USB drives or temporary devices are
        disconnected. An observation is kept when either ``/dev/<device_name>``
        or the node of its base device (partition suffix stripped via
        ``disk_base_name``) exists.

        Observations whose ``device_name`` starts with one of
        ``_LOGICAL_DEVICE_PREFIXES`` are skipped: ZFS pools, ATA hosts and
        dm-* aliases never appear under /dev/ by that name and would
        otherwise always be dismissed.

        Returns:
            The number of observations dismissed, or 0 on error (logged).
        """
        try:
            # The context manager closes the connection even when a query
            # raises; a manual close() would be skipped on that path.
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # All active observations with the device name from disk_registry.
                cursor.execute('''
                    SELECT ob.id, dr.device_name
                    FROM disk_observations ob
                    JOIN disk_registry dr ON ob.disk_registry_id = dr.id
                    WHERE ob.dismissed = 0
                ''')

                orphan_ids = []
                for obs_id, device_name in cursor.fetchall():
                    # Skip non-block observations (ZFS pools, ATA hosts, dm-mapper, etc.)
                    if device_name and device_name.startswith(self._LOGICAL_DEVICE_PREFIXES):
                        continue
                    dev_path = f'/dev/{device_name}'
                    # A partition's observation survives while its parent disk is present.
                    base_path = f'/dev/{disk_base_name(device_name)}'
                    if not os.path.exists(dev_path) and not os.path.exists(base_path):
                        orphan_ids.append((obs_id,))

                # One batched statement instead of an UPDATE per orphan.
                if orphan_ids:
                    cursor.executemany(
                        'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
                        orphan_ids)
                conn.commit()

            dismissed_count = len(orphan_ids)
            if dismissed_count > 0:
                print(f"[HealthPersistence] Cleaned up {dismissed_count} orphan observations")
            return dismissed_count
        except Exception as e:
            print(f"[HealthPersistence] Error cleaning orphan observations: {e}")
            return 0


    # ── Remote Storage Exclusions Methods ──

    # Storage types considered "remote" and therefore eligible for exclusion.
    # All entries are lowercase: `is_remote_storage_type` lowercases its
    # argument before testing membership in this set.
    REMOTE_STORAGE_TYPES = {'pbs', 'nfs', 'cifs', 'glusterfs', 'iscsi', 'iscsidirect', 'cephfs', 'rbd'}

    def is_remote_storage_type(self, storage_type: str) -> bool:
        """Check if a storage type is considered remote/external.

        The comparison is case-insensitive against ``REMOTE_STORAGE_TYPES``.

        Args:
            storage_type: Storage type name (e.g. 'nfs', 'PBS').

        Returns:
            True when the type is remote. ``None`` or any non-string value
            yields False instead of raising AttributeError.
        """
        if not isinstance(storage_type, str):
            return False
        return storage_type.lower() in self.REMOTE_STORAGE_TYPES

    def get_excluded_storages(self) -> List[Dict[str, Any]]:
        """Return every row of ``excluded_storages`` as a plain dict.

        Each dict has the keys ``storage_name``, ``storage_type``,
        ``excluded_at``, ``exclude_health``, ``exclude_notifications`` and
        ``reason``. On any error the problem is logged and ``[]`` is returned.
        """
        try:
            with self._db_connection(row_factory=True) as conn:
                records = conn.cursor().execute('''
                    SELECT storage_name, storage_type, excluded_at,
                           exclude_health, exclude_notifications, reason
                    FROM excluded_storages
                ''').fetchall()
                # Rows come back as mapping-like objects; convert to real dicts.
                return list(map(dict, records))
        except Exception as e:
            print(f"[HealthPersistence] Error getting excluded storages: {e}")
            return []

    def is_storage_excluded(self, storage_name: str, check_type: str = 'health') -> bool:
        """
        Tell whether a storage is excluded for the given kind of check.

        Args:
            storage_name: Name of the storage
            check_type: 'health' selects the health flag; any other value
                selects the notifications flag

        Returns:
            True only when a row exists for the storage and the selected flag
            equals 1. False otherwise, including on any database error.
        """
        try:
            with self._db_connection() as conn:
                if check_type == 'health':
                    column = 'exclude_health'
                else:
                    column = 'exclude_notifications'
                cursor = conn.cursor()
                cursor.execute(f'''
                    SELECT {column} FROM excluded_storages
                    WHERE storage_name = ?
                ''', (storage_name,))
                match = cursor.fetchone()
                if match is None:
                    return False
                return match[0] == 1
        except Exception:
            # Best-effort lookup: failures are treated as "not excluded".
            return False

    def exclude_storage(self, storage_name: str, storage_type: str,
                       exclude_health: bool = True, exclude_notifications: bool = True,
                       reason: str = None) -> bool:
        """
        Insert a storage into the exclusion list, or update its flags.

        When the storage is already listed, only ``exclude_health``,
        ``exclude_notifications`` and ``reason`` are overwritten; the stored
        ``storage_type`` and ``excluded_at`` are left as they were.

        Args:
            storage_name: Name of the storage to exclude
            storage_type: Type of storage (pbs, nfs, etc.)
            exclude_health: Whether to exclude from health monitoring
            exclude_notifications: Whether to exclude from notifications
            reason: Optional reason for exclusion

        Returns:
            True on success, False if the write failed (error is logged)
        """
        try:
            row_values = (
                storage_name,
                storage_type,
                datetime.now().isoformat(),
                int(bool(exclude_health)),
                int(bool(exclude_notifications)),
                reason,
            )
            with self._db_connection() as conn:
                conn.cursor().execute('''
                    INSERT INTO excluded_storages
                    (storage_name, storage_type, excluded_at, exclude_health, exclude_notifications, reason)
                    VALUES (?, ?, ?, ?, ?, ?)
                    ON CONFLICT(storage_name) DO UPDATE SET
                        exclude_health = excluded.exclude_health,
                        exclude_notifications = excluded.exclude_notifications,
                        reason = excluded.reason
                ''', row_values)
                conn.commit()
                return True
        except Exception as e:
            print(f"[HealthPersistence] Error excluding storage: {e}")
            return False

    def update_storage_exclusion(self, storage_name: str,
                                 exclude_health: Optional[bool] = None,
                                 exclude_notifications: Optional[bool] = None) -> bool:
        """
        Change one or both exclusion flags of an already-excluded storage.

        Args:
            storage_name: Name of the storage
            exclude_health: New value for health exclusion (None = don't change)
            exclude_notifications: New value for notifications exclusion (None = don't change)

        Returns:
            True when a row was updated, or when both flags are None (nothing
            to do). False when no row matches the storage or on error (logged).
        """
        requested = (
            ('exclude_health', exclude_health),
            ('exclude_notifications', exclude_notifications),
        )
        try:
            with self._db_connection() as conn:
                cursor = conn.cursor()

                # Keep only the flags the caller actually supplied.
                changes = [(column, int(bool(flag)))
                           for column, flag in requested if flag is not None]
                if not changes:
                    return True

                set_clause = ', '.join(f'{column} = ?' for column, _ in changes)
                params = [value for _, value in changes]
                params.append(storage_name)

                cursor.execute(f'''
                    UPDATE excluded_storages
                    SET {set_clause}
                    WHERE storage_name = ?
                ''', params)
                conn.commit()
                return cursor.rowcount > 0
        except Exception as e:
            print(f"[HealthPersistence] Error updating storage exclusion: {e}")
            return False

    def remove_storage_exclusion(self, storage_name: str) -> bool:
        """Delete a storage from the exclusion list.

        Returns:
            True when a row was deleted; False when the storage was not
            listed or the delete failed (error is logged).
        """
        try:
            with self._db_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    DELETE FROM excluded_storages WHERE storage_name = ?
                ''', (storage_name,))
                conn.commit()
                deleted_rows = cursor.rowcount
                return deleted_rows > 0
        except Exception as e:
            print(f"[HealthPersistence] Error removing storage exclusion: {e}")
            return False

    def get_excluded_storage_names(self, check_type: str = 'health') -> set:
        """
        Collect the names of storages excluded for one kind of check.

        Args:
            check_type: 'health' selects the health flag; any other value
                selects the notifications flag

        Returns:
            Set of storage names whose selected flag is 1; an empty set on
            any database error
        """
        try:
            with self._db_connection() as conn:
                if check_type == 'health':
                    column = 'exclude_health'
                else:
                    column = 'exclude_notifications'
                cursor = conn.cursor()
                cursor.execute(f'''
                    SELECT storage_name FROM excluded_storages
                    WHERE {column} = 1
                ''')
                names = set()
                for record in cursor.fetchall():
                    names.add(record[0])
                return names
        except Exception:
            # Best-effort lookup: failures are treated as "nothing excluded".
            return set()

    # ═══════════════════════════════════════════════════════════════════════════
    # NETWORK INTERFACE EXCLUSION MANAGEMENT
    # ═══════════════════════════════════════════════════════════════════════════

    def get_excluded_interfaces(self) -> List[Dict[str, Any]]:
        """Return every row of ``excluded_interfaces`` as a plain dict.

        Each dict has the keys ``interface_name``, ``interface_type``,
        ``excluded_at``, ``exclude_health``, ``exclude_notifications`` and
        ``reason``. On any error the problem is logged and ``[]`` is returned.
        """
        try:
            with self._db_connection(row_factory=True) as conn:
                records = conn.cursor().execute('''
                    SELECT interface_name, interface_type, excluded_at,
                           exclude_health, exclude_notifications, reason
                    FROM excluded_interfaces
                ''').fetchall()
                # Rows come back as mapping-like objects; convert to real dicts.
                return list(map(dict, records))
        except Exception as e:
            print(f"[HealthPersistence] Error getting excluded interfaces: {e}")
            return []

    def is_interface_excluded(self, interface_name: str, check_type: str = 'health') -> bool:
        """
        Tell whether a network interface is excluded for the given kind of check.

        Args:
            interface_name: Name of the interface (e.g., 'vmbr0', 'eth0')
            check_type: 'health' selects the health flag; any other value
                selects the notifications flag

        Returns:
            True when a row exists for the interface with the selected flag
            set to 1. False otherwise, including on any database error.
        """
        try:
            with self._db_connection() as conn:
                if check_type == 'health':
                    column = 'exclude_health'
                else:
                    column = 'exclude_notifications'
                cursor = conn.cursor()
                cursor.execute(f'''
                    SELECT 1 FROM excluded_interfaces
                    WHERE interface_name = ? AND {column} = 1
                ''', (interface_name,))
                found = cursor.fetchone()
                return found is not None
        except Exception:
            # Best-effort lookup: failures are treated as "not excluded".
            return False

    def exclude_interface(self, interface_name: str, interface_type: str,
                         exclude_health: bool = True, exclude_notifications: bool = True,
                         reason: str = None) -> bool:
        """
        Insert a network interface into the exclusion list, replacing any
        existing entry for the same name (all columns, including the
        timestamp, are rewritten).

        Args:
            interface_name: Name of the interface (e.g., 'vmbr0')
            interface_type: Type of interface ('bridge', 'physical', 'bond', 'vlan')
            exclude_health: Whether to exclude from health monitoring
            exclude_notifications: Whether to exclude from notifications
            reason: Optional reason for exclusion

        Returns:
            True on success, False if the write failed (error is logged)
        """
        try:
            with self._db_connection() as conn:
                row_values = (
                    interface_name,
                    interface_type,
                    datetime.now().isoformat(),
                    int(bool(exclude_health)),
                    int(bool(exclude_notifications)),
                    reason,
                )
                conn.cursor().execute('''
                    INSERT OR REPLACE INTO excluded_interfaces
                    (interface_name, interface_type, excluded_at, exclude_health, exclude_notifications, reason)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', row_values)
                conn.commit()
                print(f"[HealthPersistence] Interface {interface_name} added to exclusions")
                return True
        except Exception as e:
            print(f"[HealthPersistence] Error excluding interface: {e}")
            return False

    def update_interface_exclusion(self, interface_name: str,
                                   exclude_health: bool, exclude_notifications: bool) -> bool:
        """Overwrite both exclusion flags of an already-excluded interface.

        Returns:
            True when a row was updated; False when the interface is not
            listed or the update failed (error is logged).
        """
        try:
            with self._db_connection() as conn:
                health_flag = int(bool(exclude_health))
                notify_flag = int(bool(exclude_notifications))
                cursor = conn.cursor()
                cursor.execute('''
                    UPDATE excluded_interfaces
                    SET exclude_health = ?, exclude_notifications = ?
                    WHERE interface_name = ?
                ''', (health_flag, notify_flag, interface_name))
                conn.commit()
                return cursor.rowcount > 0
        except Exception as e:
            print(f"[HealthPersistence] Error updating interface exclusion: {e}")
            return False

    def remove_interface_exclusion(self, interface_name: str) -> bool:
        """Remove an interface from the exclusion list."""
        try:
            with self._db_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('DELETE FROM excluded_interfaces WHERE interface_name = ?', (interface_name,))
                conn.commit()
                removed = cursor.rowcount > 0
                if removed:
                    print(f"[HealthPersistence] Interface {interface_name} removed from exclusions")
                return removed
        except Exception as e:
            print(f"[HealthPersistence] Error removing interface exclusion: {e}")
            return False

    def get_excluded_interface_names(self, check_type: str = 'health') -> set:
        """
        Get set of interface names excluded for a specific check type.

        Args:
            check_type: 'health' or 'notifications'

        Returns:
            Set of excluded interface names
        """
        try:
            with self._db_connection() as conn:
                cursor = conn.cursor()
                column = 'exclude_health' if check_type == 'health' else 'exclude_notifications'
                cursor.execute(f'''
                    SELECT interface_name FROM excluded_interfaces
                    WHERE {column} = 1
                ''')
                return {row[0] for row in cursor.fetchall()}
        except Exception:
            return set()


    def _clear_notification_cooldown(self, error_key: str):
        """
        Clear notification cooldown from notification_last_sent for non-disk errors.

        This coordinates with PollingCollector's 24h cooldown system.
        When any error is dismissed, we remove the corresponding cooldown entry
        so the error can be re-detected and re-notified after the suppression period expires.

        The PollingCollector uses 'health_' prefix for all its fingerprints.
        Audit Tier 5 (Health stack — `_clear_notification_cooldown` LIKE
        overmatch): the previous implementation had a fallback
        ``DELETE ... WHERE fingerprint LIKE '%<error_key>%'`` which broke as
        soon as two errors shared a substring (e.g. ``vm_1`` matched ``vm_10``,
        ``vm_100``, ``vm_1xyz``...). We drop that catch-all and rely on
        deterministic exact matches.
        """
        try:
            conn = self._get_conn()
            cursor = conn.cursor()

            # Match all the prefixes the PollingCollector uses for this key.
            # Anchored to the start, no wildcards inside, so we can never
            # over-match a different error.
            fingerprints = (
                error_key,
                f'health_{error_key}',
            )
            placeholders = ','.join('?' for _ in fingerprints)
            cursor.execute(
                f'DELETE FROM notification_last_sent WHERE fingerprint IN ({placeholders})',
                fingerprints,
            )

            deleted_count = cursor.rowcount
            conn.commit()
            conn.close()

            if deleted_count > 0:
                print(f"[HealthPersistence] Cleared notification cooldowns for {error_key}")
        except Exception as e:
            print(f"[HealthPersistence] Error clearing notification cooldown: {e}")

    def _clear_disk_io_cooldown(self, error_key: str):
        """
        Clear disk I/O cooldowns from notification_last_sent when an error is dismissed.

        This coordinates with BOTH:
        1. JournalWatcher's 24h cooldown system (prefixes: diskio_, fs_)
        2. PollingCollector's 24h cooldown system (prefix: health_)

        When a disk error is dismissed, we remove the corresponding cooldown entries
        so the error can be re-detected and re-notified after the suppression period expires.

        Matches fingerprints like:
        - diskio_sdh, diskio_sda, diskio_nvme0n1
        - fs_sdh1, fs_sda2
        - health_disk_smart_sdh, health_disk_io_error_sdh
        - sdh (direct device name used by JournalWatcher)

        A fingerprint is cleared when it is one of the exact keys built below,
        or when it contains the base device name (optionally followed by a
        partition suffix) as a whole token, i.e. not glued to further
        letters/digits. The previous ``LIKE '<key>%'`` / ``LIKE '%<device>%'``
        deletes also wiped cooldowns of unrelated disks that merely share a
        substring (``sda`` matched ``sdaa``, ``nvme0n1`` matched ``nvme0n10``),
        the same over-match that was removed from
        ``_clear_notification_cooldown``.

        NOTE(review): serial-based fingerprints (``fs_serial_<serial>``) carry
        no device name, so they are only cleared if the serial happens to
        contain the device token — confirm whether they need separate handling.

        Args:
            error_key: Key of the dismissed disk error, e.g. ``disk_fs_sdh``,
                ``disk_smart_sda``, ``disk_io_error_sdh``, ``smart_sdh`` or a
                bare device name. Keys from which no device name can be
                extracted are ignored.

        Returns:
            None. Failures are reported on stdout and never raised.
        """
        try:
            # Extract device name from error_key
            # Common patterns: disk_fs_sdh, disk_smart_sda, disk_io_error_sdh, smart_sdh
            device_match = re.search(r'(?:disk_fs_|disk_smart_|disk_io_error_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
            if not device_match:
                # Try to extract device from error_key directly if no pattern matches
                # e.g., error_key might just be the device name
                device_match = re.match(r'^([a-z]{2,4}[a-z0-9]*)$', error_key)
                if not device_match:
                    return

            device = device_match.group(1)
            # sdh1 → sdh, nvme0n1p1 → nvme0n1; fall back to the raw name if
            # the helper yields nothing usable.
            base_device = disk_base_name(device) or device

            # Exact fingerprints known to be produced for this device.
            # JournalWatcher uses: direct device name, diskio_, fs_
            # PollingCollector uses: health_ prefix
            exact_fingerprints = {
                # JournalWatcher patterns
                device,  # Direct device name (JournalWatcher._check_disk_io uses this)
                base_device,
                f'diskio_{device}',
                f'diskio_{base_device}',
                f'fs_{device}',
                f'fs_{base_device}',
                # PollingCollector patterns (uses health_ prefix)
                f'health_{error_key}',
                f'health_disk_smart_{device}',
                f'health_disk_smart_{base_device}',
                f'health_disk_io_error_{device}',
                f'health_disk_io_error_{base_device}',
                f'health_disk_fs_{device}',
                f'health_disk_fs_{base_device}',
            }

            # Whole-token match for the disk and its partitions. Bases ending
            # in a digit (nvme0n1, mmcblk0, loop0) take a 'pN' partition
            # suffix; letter-terminated bases (sda) take a plain number. The
            # look-arounds stop 'sda' from matching inside 'sdaa' and
            # 'nvme0n1' from matching inside 'nvme0n10'.
            partition_suffix = r'(?:p\d+)?' if base_device[-1].isdigit() else r'\d*'
            device_token = re.compile(
                rf'(?<![a-z0-9]){re.escape(base_device)}{partition_suffix}(?![a-z0-9])'
            )

            conn = self._get_conn()
            try:
                cursor = conn.cursor()

                # Coarse pre-filter in SQL (base_device is [a-z0-9] only, so
                # it contains no LIKE wildcards); the precise boundary check
                # is done in Python because LIKE cannot express it.
                cursor.execute(
                    'SELECT fingerprint FROM notification_last_sent WHERE fingerprint LIKE ?',
                    (f'%{base_device}%',)
                )
                to_delete = set(exact_fingerprints)
                to_delete.update(
                    row[0] for row in cursor.fetchall()
                    if isinstance(row[0], str) and device_token.search(row[0])
                )

                cursor.executemany(
                    'DELETE FROM notification_last_sent WHERE fingerprint = ?',
                    [(fingerprint,) for fingerprint in to_delete]
                )
                conn.commit()
            finally:
                # Release the connection even when a statement raises;
                # previously it was only closed on the success path.
                conn.close()

            print(f"[HealthPersistence] Cleared disk I/O cooldowns for {error_key} (device: {device})")
        except Exception as e:
            print(f"[HealthPersistence] Error clearing disk I/O cooldown: {e}")


# Global instance: module-level HealthPersistence object, constructed once
# when this module is imported.
health_persistence = HealthPersistence()