ProxMenux/AppImage/scripts/health_monitor.py

"""
ProxMenux Health Monitor Module
Provides comprehensive, lightweight health checks for Proxmox systems.
Optimized for minimal system impact with intelligent thresholds and hysteresis.

Author: MacRimi
Version: 1.2 (Always returns all 10 categories)
"""

import psutil
import subprocess
import json
import time
import os
import hashlib # Added for MD5 hashing
from typing import Dict, List, Any, Tuple, Optional
from datetime import datetime, timedelta
from collections import defaultdict
import re

from health_persistence import health_persistence

try:
    from proxmox_storage_monitor import proxmox_storage_monitor
    PROXMOX_STORAGE_AVAILABLE = True
except ImportError:
    PROXMOX_STORAGE_AVAILABLE = False

# ============================================================================
# PERFORMANCE DEBUG FLAG - Set to True to log timing of each health check
# To analyze: grep "\[PERF\]" /var/log/proxmenux-monitor.log | sort -t'=' -k2 -n
# Set to False or remove this section after debugging
# ============================================================================
DEBUG_PERF = False

# ─── Startup Grace Period ────────────────────────────────────────────────────
# Import centralized startup grace management for consistent behavior
import startup_grace

def _is_startup_health_grace() -> bool:
    """Report whether the startup health grace period is still active.

    Thin delegate to ``startup_grace.is_startup_health_grace()`` so that every
    component shares a single definition of the grace window (documented
    by the original author as 5 minutes).
    """
    in_grace = startup_grace.is_startup_health_grace()
    return in_grace

def _perf_log(section: str, elapsed_ms: float):
    """Print a ``[PERF]`` timing line for *section*; no-op unless DEBUG_PERF is True.

    Args:
        section: Label of the measured section.
        elapsed_ms: Elapsed time in milliseconds (printed with one decimal).
    """
    if not DEBUG_PERF:
        return
    print(f"[PERF] {section} = {elapsed_ms:.1f}ms")

class HealthMonitor:
    """
    Monitors system health across multiple components with minimal impact.
    Implements hysteresis, intelligent caching, progressive escalation, and persistent error tracking.
    Always returns all 10 health categories.
    """

    # CPU Thresholds
    # Usage values are percentages compared against psutil.cpu_percent();
    # the *_DURATION values are look-back windows in seconds.
    CPU_WARNING = 85
    CPU_CRITICAL = 95
    CPU_RECOVERY = 75  # samples below this count as "recovered"
    CPU_WARNING_DURATION = 300  # 5 minutes sustained
    CPU_CRITICAL_DURATION = 300  # 5 minutes sustained
    CPU_RECOVERY_DURATION = 120

    # Memory Thresholds
    # NOTE(review): presumably percent for the usage values and seconds for the
    # durations -- the code that reads them is outside this chunk; confirm.
    MEMORY_WARNING = 85
    MEMORY_CRITICAL = 95
    MEMORY_DURATION = 60
    SWAP_WARNING_DURATION = 300
    SWAP_CRITICAL_PERCENT = 5
    SWAP_CRITICAL_DURATION = 120

    # Storage Thresholds (presumably percent used -- confirm against the storage checks)
    STORAGE_WARNING = 85
    STORAGE_CRITICAL = 95

    # Temperature Thresholds (presumably degrees Celsius -- confirm against the temperature check)
    TEMP_WARNING = 80
    TEMP_CRITICAL = 90

    # Network Thresholds
    # NOTE(review): latency presumably in milliseconds, timeout/duration in
    # seconds -- confirm against the network check.
    NETWORK_LATENCY_WARNING = 100
    NETWORK_LATENCY_CRITICAL = 300
    NETWORK_TIMEOUT = 2
    NETWORK_INACTIVE_DURATION = 600

    # Log Thresholds
    LOG_ERRORS_WARNING = 5
    LOG_ERRORS_CRITICAL = 10
    LOG_WARNINGS_WARNING = 15
    LOG_WARNINGS_CRITICAL = 30
    LOG_CHECK_INTERVAL = 3420  # 57 min - offset to avoid sync with other hourly processes

    # Updates Thresholds (days)
    UPDATES_WARNING = 365   # Only warn after 1 year without updates (system_age)
    UPDATES_CRITICAL = 548  # Critical after 18 months without updates
    SECURITY_WARN_DAYS = 360  # Security updates only become WARNING after 360 days unpatched

    # Regex fragments describing log lines that are treated as benign noise.
    # _get_compiled_patterns() joins all entries with "|" into one alternation
    # and compiles it with re.IGNORECASE, so each entry must be a valid regex
    # fragment and must not rely on case for matching through that path.
    BENIGN_ERROR_PATTERNS = [
        # ── Proxmox API / proxy operational noise ──
        r'got inotify poll request in wrong process',
        r'auth key pair too old, rotating',
        r'proxy detected vanished client connection',
        r'worker \d+ finished',
        r'connection timed out',
        r'disconnect peer',
        r'task OK',
        r'backup finished',
        # PVE ticket / auth transient errors (web UI session expiry, API token
        # refresh, brute-force bots). These are logged at WARNING/ERR level
        # but are NOT system problems -- they are access-control events.
        r'invalid PVE ticket',
        r'authentication failure.*pve',
        r'permission denied.*ticket',
        r'no ticket',
        r'CSRF.*failed',
        r'pveproxy\[\d+\]: authentication failure',
        r'pvedaemon\[\d+\]: authentication failure',
        # PVE cluster/corosync normal chatter
        r'corosync.*retransmit',
        r'corosync.*delivering',
        r'pmxcfs.*update',
        r'pve-cluster\[\d+\]:.*status',

        # ── Systemd informational messages ──
        r'(started|starting|stopped|stopping) session',
        r'session \d+ logged (in|out)',
        r'new session \d+ of user',
        r'removed session \d+',
        r'user@\d+\.service:',
        r'user runtime directory',
        # Systemd service restarts (normal lifecycle)
        r'systemd\[\d+\]: .+\.service: (Scheduled restart|Consumed)',
        r'systemd\[\d+\]: .+\.service: Deactivated successfully',

        # ── Network transient errors (common and usually self-recovering) ──
        r'dhcp.*timeout',
        r'temporary failure in name resolution',
        r'network is unreachable',
        r'no route to host',

        # ── Backup and sync normal warnings ──
        r'rsync.*vanished',
        r'backup job .* finished',
        r'vzdump backup .* finished',

        # ── ZFS informational ──
        r'zfs.*scrub (started|finished|in progress)',
        r'zpool.*resilver',

        # ── LXC/Container normal operations ──
        r'lxc.*monitor',
        r'systemd\[1\]: (started|stopped) .*\.scope',

        # ── ATA/SCSI transient bus errors ──
        # These are logged at ERR level but are common on SATA controllers
        # during hot-plug, link renegotiation, or cable noise. They are NOT
        # indicative of disk failure unless SMART also reports problems.
        # NOTE: patterns are matched against line.lower(), so use lowercase.
        # NOTE(review): several entries above contain uppercase characters
        # ('task OK', 'invalid PVE ticket', 'CSRF.*failed', 'Scheduled restart',
        # 'Consumed', 'Deactivated successfully'). They only match a lowercased
        # line when the regex is applied case-insensitively -- confirm that every
        # consumer goes through the re.IGNORECASE-compiled pattern.
        r'ata\d+.*serror.*badcrc',
        r'ata\d+.*emask 0x10.*ata bus error',
        r'failed command: (read|write) fpdma queued',
        r'ata\d+.*hard resetting link',
        r'ata\d+.*link is slow',
        r'ata\d+.*comreset',

        # ── ProxMenux self-referential noise ──
        # The monitor reporting its OWN service failures is circular --
        # it cannot meaningfully alert about itself.
        # NOTE: patterns are matched against line.lower(), so use lowercase.
        r'proxmenux-monitor\.service.*failed',
        r'proxmenux-monitor\.service.*exit-code',
        r'proxmenux-monitor.*failed at step exec',
        r'proxmenux-monitor\.appimage',

        # ── PVE scheduler operational noise ──
        # pvescheduler emits "could not update job state" every minute
        # when a scheduled job reference is stale.  This is cosmetic,
        # not a system problem.
        r'pvescheduler.*could not update job state',
        r'pvescheduler.*no such task',
    ]

    # Despite the name, entries are regex fragments (e.g. 'raid.*failed'), not
    # literal keywords: _get_compiled_patterns() joins them with "|" and
    # compiles the result with re.IGNORECASE.
    CRITICAL_LOG_KEYWORDS = [
        'out of memory', 'oom_kill', 'kernel panic',
        'filesystem read-only', 'cannot mount',
        'raid.*failed', 'md.*device failed',
        'ext4-fs error', 'xfs.*corruption',
        'lvm activation failed',
        'hardware error', 'mce:',
        'general protection fault',
    ]

    # Segfault is WARNING, not CRITICAL -- only PVE-critical process
    # segfaults are escalated to CRITICAL in _classify_log_severity.
    PVE_CRITICAL_PROCESSES = {
        'pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster',
        'corosync', 'qemu-system', 'lxc-start', 'ceph-osd',
        'ceph-mon', 'pmxcfs', 'kvm',
    }

    # Regex fragments as well (e.g. 'service.*failed'); compiled the same way
    # as CRITICAL_LOG_KEYWORDS.
    WARNING_LOG_KEYWORDS = [
        'i/o error', 'ata error', 'scsi error',
        'task hung', 'blocked for more than',
        'failed to start', 'service.*failed',
        'disk.*offline', 'disk.*removed',
        'segfault',  # WARNING by default; escalated to CRITICAL only for PVE processes
    ]

    # PVE Critical Services
    PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster']

    # P2 fix: Pre-compiled regex patterns for performance (avoid re-compiling on every line)
    # These class-level slots start as None and are filled on first use by
    # _get_compiled_patterns().
    _BENIGN_RE = None
    _CRITICAL_RE = None
    _WARNING_RE = None

    @classmethod
    def _get_compiled_patterns(cls):
        """Lazily compile regex patterns once"""
        if cls._BENIGN_RE is None:
            cls._BENIGN_RE = re.compile("|".join(cls.BENIGN_ERROR_PATTERNS), re.IGNORECASE)
            cls._CRITICAL_RE = re.compile("|".join(cls.CRITICAL_LOG_KEYWORDS), re.IGNORECASE)
            cls._WARNING_RE = re.compile("|".join(cls.WARNING_LOG_KEYWORDS), re.IGNORECASE)
        return cls._BENIGN_RE, cls._CRITICAL_RE, cls._WARNING_RE

    def __init__(self):
        """Initialize health monitor with state tracking"""
        self.state_history = defaultdict(list)
        self.last_check_times = {}
        self.cached_results = {}
        self.network_baseline = {}
        self.io_error_history = defaultdict(list)
        self.failed_vm_history = set()  # Track VMs that failed to start
        self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
        self._unknown_counts = {}  # Track consecutive UNKNOWN cycles per category
        self._last_cleanup_time = 0  # Throttle cleanup_old_errors calls

        # SMART check cache - reduces disk queries from every 5 min to every 30 min
        self._smart_cache = {}  # {disk_name: {'result': 'PASSED', 'time': timestamp}}
        self._SMART_CACHE_TTL = 1620  # 27 min - offset to avoid sync with other processes

        # Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
        self._journalctl_24h_cache = {'count': 0, 'time': 0}
        self._JOURNALCTL_24H_CACHE_TTL = 3600  # 1 hour - login attempts aggregate slowly

        # Journalctl 10min cache - shared across checks to avoid duplicate calls
        # Multiple checks (cpu_temp, vms_cts) use the same journalctl query
        self._journalctl_10min_cache = {'output': '', 'time': 0}
        self._JOURNALCTL_10MIN_CACHE_TTL = 60  # 1 minute - fresh enough for health checks

        # Journalctl 1hour cache - for disk health events (SMART warnings, I/O errors)
        self._journalctl_1hour_cache = {'output': '', 'time': 0}
        self._JOURNALCTL_1HOUR_CACHE_TTL = 300  # 5 min cache - disk events don't need real-time

        # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
        # SMART detection still uses filesystem check on init (lightweight)
        has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl')
        self.capabilities = {'has_zfs': False, 'has_lvm': False, 'has_smart': has_smart}

        try:
            health_persistence.cleanup_old_errors()
        except Exception as e:
            print(f"[HealthMonitor] Cleanup warning: {e}")

    def _get_journalctl_10min_warnings(self) -> str:
        """Get journalctl warnings from last 10 minutes, cached to avoid duplicate calls.

        Multiple health checks need the same journalctl data (cpu_temp, vms_cts, etc).
        This method caches the result for 60 seconds to reduce subprocess overhead.
        """
        current_time = time.time()
        cache = self._journalctl_10min_cache

        # Return cached result if fresh
        if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_10MIN_CACHE_TTL:
            return cache['output']

        # Execute journalctl and cache result
        # Use -b 0 to only include logs from the current boot
        try:
            result = subprocess.run(
                ['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
            )
            if result.returncode == 0:
                cache['output'] = result.stdout
                cache['time'] = current_time
                return cache['output']
        except subprocess.TimeoutExpired:
            print("[HealthMonitor] journalctl 10min cache: timeout")
        except Exception as e:
            print(f"[HealthMonitor] journalctl 10min cache error: {e}")

        return cache.get('output', '')  # Return stale cache on error

    def _get_journalctl_1hour_warnings(self) -> str:
        """Get journalctl warnings from last 1 hour, cached for disk health checks.

        Used by _check_disk_health_from_events for SMART warnings and I/O errors.
        Cached for 5 minutes since disk events don't require real-time detection.
        """
        current_time = time.time()
        cache = self._journalctl_1hour_cache

        # Return cached result if fresh
        if cache['output'] and (current_time - cache['time']) < self._JOURNALCTL_1HOUR_CACHE_TTL:
            return cache['output']

        # Execute journalctl and cache result
        # Use -b 0 to only include logs from the current boot
        try:
            result = subprocess.run(
                ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
                 '--output=short-precise'],
                capture_output=True,
                text=True,
                timeout=15
            )
            if result.returncode == 0:
                cache['output'] = result.stdout
                cache['time'] = current_time
                return cache['output']
        except subprocess.TimeoutExpired:
            print("[HealthMonitor] journalctl 1hour cache: timeout")
        except Exception as e:
            print(f"[HealthMonitor] journalctl 1hour cache error: {e}")

        return cache.get('output', '')  # Return stale cache on error

    # ─── Lightweight sampling methods for the dedicated vital-signs thread ───
    # These ONLY append data to state_history without triggering evaluation,
    # persistence, or subprocess-heavy operations.

    def _sample_cpu_usage(self):
        """Lightweight CPU sample: read usage % and append to history. ~30ms cost."""
        try:
            cpu_percent = psutil.cpu_percent(interval=0)
            current_time = time.time()
            state_key = 'cpu_usage'
            self.state_history[state_key].append({
                'value': cpu_percent,
                'time': current_time
            })
            # Prune entries older than 6 minutes
            self.state_history[state_key] = [
                e for e in self.state_history[state_key]
                if current_time - e['time'] < 360
            ]
        except Exception:
            pass  # Sampling must never crash the thread

    def _sample_cpu_temperature(self):
        """Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
        try:
            result = subprocess.run(
                ['sensors', '-A', '-u'],
                capture_output=True, text=True, timeout=2
            )
            if result.returncode != 0:
                return

            temps = []
            for line in result.stdout.split('\n'):
                if 'temp' in line.lower() and '_input' in line:
                    try:
                        temp = float(line.split(':')[1].strip())
                        temps.append(temp)
                    except Exception:
                        continue

            if temps:
                max_temp = max(temps)
                current_time = time.time()
                state_key = 'cpu_temp_history'
                self.state_history[state_key].append({
                    'value': max_temp,
                    'time': current_time
                })
                # Prune entries older than 4 minutes
                self.state_history[state_key] = [
                    e for e in self.state_history[state_key]
                    if current_time - e['time'] < 240
                ]
        except Exception:
            pass  # Sampling must never crash the thread

    def get_system_info(self) -> Dict[str, Any]:
        """
        Get lightweight system info for header display.
        Returns: hostname, uptime, and cached health status.
        This is extremely lightweight and uses cached health status.
        """
        try:
            # Get hostname
            hostname = os.uname().nodename

            # Get uptime (very cheap operation)
            uptime_seconds = time.time() - psutil.boot_time()

            # Get cached health status (no expensive checks)
            health_status = self.get_cached_health_status()

            return {
                'hostname': hostname,
                'uptime_seconds': int(uptime_seconds),
                'uptime': self._format_uptime(uptime_seconds),
                'health': health_status,
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            return {
                'hostname': 'unknown',
                'uptime_seconds': 0,
                'uptime': 'Unknown',
                'health': {'status': 'UNKNOWN', 'summary': f'Error: {str(e)}'},
                'timestamp': datetime.now().isoformat()
            }

    def _format_uptime(self, seconds: float) -> str:
        """Format uptime in human-readable format"""
        days = int(seconds // 86400)
        hours = int((seconds % 86400) // 3600)
        minutes = int((seconds % 3600) // 60)

        if days > 0:
            return f"{days}d {hours}h {minutes}m"
        elif hours > 0:
            return f"{hours}h {minutes}m"
        else:
            return f"{minutes}m"

    def get_cached_health_status(self) -> Dict[str, str]:
        """
        Get cached health status without running expensive checks.
        The background health collector keeps '_bg_overall' always fresh (every 5 min).
        Falls back to calculating on demand if background data is stale or unavailable.
        """
        current_time = time.time()

        # 1. Check background collector cache (updated every 5 min by _health_collector_loop)
        bg_key = '_bg_overall'
        if bg_key in self.last_check_times:
            age = current_time - self.last_check_times[bg_key]
            if age < 360:  # 6 min (5 min interval + 1 min tolerance)
                return self.cached_results.get(bg_key, {'status': 'OK', 'summary': 'System operational'})

        # 2. Check regular cache (updated by modal fetches or on-demand)
        cache_key = 'overall_health'
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 60:
                return self.cached_results.get(cache_key, {'status': 'OK', 'summary': 'System operational'})

        # 3. No fresh cache - calculate on demand (happens only on first load before bg thread runs)
        status = self.get_overall_status()
        self.cached_results[cache_key] = {
            'status': status['status'],
            'summary': status['summary']
        }
        self.last_check_times[cache_key] = current_time

        return self.cached_results[cache_key]

    def get_overall_status(self) -> Dict[str, Any]:
        """Get overall health status summary with minimal overhead"""
        details = self.get_detailed_status()

        overall_status = details.get('overall', 'OK')
        summary = details.get('summary', '')

        # Count statuses
        critical_count = 0
        warning_count = 0
        ok_count = 0

        for category, data in details.get('details', {}).items():
            if isinstance(data, dict):
                status = data.get('status', 'OK')
                if status == 'CRITICAL':
                    critical_count += 1
                elif status == 'WARNING':
                    warning_count += 1
                elif status == 'OK':
                    ok_count += 1

        return {
            'status': overall_status,
            'summary': summary,
            'critical_count': critical_count,
            'warning_count': warning_count,
            'ok_count': ok_count,
            'timestamp': datetime.now().isoformat()
        }

    def get_detailed_status(self) -> Dict[str, Any]:
        """
        Run every health check and aggregate the results.

        The checks run in a fixed priority order (services, Proxmox storage,
        disks, VMs/CTs, network, CPU, memory, logs, updates, security). Each
        result is stored under its category key in ``details``, so all 10
        categories are always present (categories start as ``{'status': 'OK'}``).

        Side effects visible in this method:
          - throttled ``health_persistence.cleanup_old_errors()`` (at most
            once per 300 s),
          - ``self.capabilities['has_zfs'/'has_lvm']`` refreshed from the
            Proxmox storage result,
          - consecutive-UNKNOWN counters updated and persisted on the 3rd cycle,
          - ``state_change`` events emitted when the overall status changes or
            a category enters WARNING/CRITICAL.

        Returns:
            Dict with keys ``overall`` ('CRITICAL', 'WARNING' or 'OK'),
            ``summary`` (up to three issue strings joined by '; '),
            ``details`` (per-category result dicts) and ``timestamp``
            (ISO-8601 string from ``datetime.now()``).
        """
        # Run cleanup with throttle (every 5 min) so stale errors are auto-resolved
        # using the user-configured Suppression Duration (single source of truth).
        current_time = time.time()
        if current_time - self._last_cleanup_time > 300:  # 5 minutes
            try:
                health_persistence.cleanup_old_errors()
                self._last_cleanup_time = current_time
            except Exception:
                pass

        # NOTE(review): `active_errors` is never read in this method, and the
        # call is not guarded by try/except -- confirm whether it is still needed.
        active_errors = health_persistence.get_active_errors()
        # No need to create persistent_issues dict here, it's implicitly handled by the checks

        details = {
            'cpu': {'status': 'OK'},
            'memory': {'status': 'OK'},
            'storage': {'status': 'OK'}, # This will be overwritten by specific storage checks
            'disks': {'status': 'OK'}, # This will be overwritten by disk/filesystem checks
            'network': {'status': 'OK'},
            'vms': {'status': 'OK'},
            'services': {'status': 'OK'},
            'logs': {'status': 'OK'},
            'updates': {'status': 'OK'},
            'security': {'status': 'OK'}
        }

        # Human-readable issue strings, grouped by severity; used for the summary.
        critical_issues = []
        warning_issues = []
        info_issues = []  # Added info_issues to track INFO separately

        # --- Priority Order of Checks ---
        _t_total = time.time()  # [PERF] Total health check timing

        # Priority 1: Critical PVE Services
        # NOTE(review): this is the only check that indexes ['status'] directly;
        # every other check below uses .get('status').
        _t = time.time()
        services_status = self._check_pve_services()
        _perf_log("services", (time.time() - _t) * 1000)
        details['services'] = services_status
        if services_status['status'] == 'CRITICAL':
            critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}")
        elif services_status['status'] == 'WARNING':
            warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}")

        # Priority 1.5: Proxmox Storage Check (External Module)
        _t = time.time()
        proxmox_storage_result = self._check_proxmox_storage()
        _perf_log("proxmox_storage", (time.time() - _t) * 1000)
        if proxmox_storage_result: # Only process if the check ran (module available)
            details['storage'] = proxmox_storage_result
            # Unlike the other categories, the reason is appended without a category prefix.
            if proxmox_storage_result.get('status') == 'CRITICAL':
                critical_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage unavailable'))
            elif proxmox_storage_result.get('status') == 'WARNING':
                warning_issues.append(proxmox_storage_result.get('reason', 'Proxmox storage issue'))

            # Derive capabilities from Proxmox storage types (immediate, no extra checks)
            # The storage type is taken as the first space-separated word of each
            # check's 'detail' string, lowercased.
            storage_checks = proxmox_storage_result.get('checks', {})
            storage_types = {v.get('detail', '').split(' ')[0].lower() for v in storage_checks.values() if isinstance(v, dict)}
            self.capabilities['has_zfs'] = any(t in ('zfspool', 'zfs') for t in storage_types)
            self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types)

        # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
        _t = time.time()
        storage_status = self._check_storage_optimized()
        _perf_log("storage_optimized", (time.time() - _t) * 1000)
        details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues
        if storage_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}")
        elif storage_status.get('status') == 'WARNING':
            warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}")

        # Priority 3: VMs/CTs Status (with persistence)
        _t = time.time()
        vms_status = self._check_vms_cts_with_persistence()
        _perf_log("vms_cts", (time.time() - _t) * 1000)
        details['vms'] = vms_status
        if vms_status.get('status') == 'CRITICAL':
            critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}")
        elif vms_status.get('status') == 'WARNING':
            warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}")

        # Priority 4: Network Connectivity
        _t = time.time()
        network_status = self._check_network_optimized()
        _perf_log("network", (time.time() - _t) * 1000)
        details['network'] = network_status
        if network_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}")
        elif network_status.get('status') == 'WARNING':
            warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}")

        # Priority 5: CPU Usage (with hysteresis)
        _t = time.time()
        cpu_status = self._check_cpu_with_hysteresis()
        _perf_log("cpu", (time.time() - _t) * 1000)
        details['cpu'] = cpu_status
        if cpu_status.get('status') == 'CRITICAL':
            critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}")
        elif cpu_status.get('status') == 'WARNING':
            warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}")

        # Priority 6: Memory Usage (RAM and Swap)
        _t = time.time()
        memory_status = self._check_memory_comprehensive()
        _perf_log("memory", (time.time() - _t) * 1000)
        details['memory'] = memory_status
        if memory_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}")
        elif memory_status.get('status') == 'WARNING':
            warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}")

        # Priority 7: Log Analysis (with persistence)
        _t = time.time()
        logs_status = self._check_logs_with_persistence()
        _perf_log("logs", (time.time() - _t) * 1000)
        details['logs'] = logs_status
        if logs_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}")
        elif logs_status.get('status') == 'WARNING':
            warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}")

        # Priority 8: System Updates
        _t = time.time()
        updates_status = self._check_updates()
        _perf_log("updates", (time.time() - _t) * 1000)
        details['updates'] = updates_status
        if updates_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}")
        elif updates_status.get('status') == 'WARNING':
            warning_issues.append(f"Updates: {updates_status.get('reason', 'Updates pending')}")
        elif updates_status.get('status') == 'INFO':
            info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}")

        # Priority 9: Security Checks
        # NOTE(review): a CRITICAL security status is not added to
        # critical_issues here, so it would not raise the overall status --
        # confirm whether _check_security can return CRITICAL.
        _t = time.time()
        security_status = self._check_security()
        _perf_log("security", (time.time() - _t) * 1000)
        details['security'] = security_status
        if security_status.get('status') == 'WARNING':
            warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}")
        elif security_status.get('status') == 'INFO':
            info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")

        # Log total time for all checks
        _perf_log("TOTAL_HEALTH_CHECK", (time.time() - _t_total) * 1000)

        # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
        unknown_issues = []
        for cat_key, cat_data in details.items():
            cat_status = cat_data.get('status', 'OK')
            if cat_status == 'UNKNOWN':
                count = self._unknown_counts.get(cat_key, 0) + 1
                self._unknown_counts[cat_key] = min(count, 10)  # Cap to avoid unbounded growth
                unknown_issues.append(f"{cat_key}: {cat_data.get('reason', 'Check unavailable')}")
                if count == 3:  # Only persist on the exact 3rd cycle, not every cycle after
                    try:
                        health_persistence.record_unknown_persistent(
                            cat_key, cat_data.get('reason', 'Check unavailable'))
                    except Exception:
                        pass
            else:
                # Any non-UNKNOWN result resets the consecutive counter.
                self._unknown_counts[cat_key] = 0

        # --- Determine Overall Status ---
        # Severity: CRITICAL > WARNING > UNKNOWN (capped at WARNING) > INFO > OK
        # The summary lists at most the first three issues of the winning severity.
        if critical_issues:
            overall = 'CRITICAL'
            summary = '; '.join(critical_issues[:3])
        elif warning_issues:
            overall = 'WARNING'
            summary = '; '.join(warning_issues[:3])
        elif unknown_issues:
            overall = 'WARNING'  # UNKNOWN caps at WARNING, never escalates to CRITICAL
            summary = '; '.join(unknown_issues[:3])
        elif info_issues:
            overall = 'OK'  # INFO statuses don't degrade overall health
            summary = '; '.join(info_issues[:3])
        else:
            overall = 'OK'
            summary = 'All systems operational'

        # --- Emit events for state changes (Block A: notification prep) ---
        try:
            # On the very first run there is no previous status, so no overall
            # event is emitted.
            previous_overall = getattr(self, '_last_overall_status', None)
            if previous_overall and previous_overall != overall:
                # Overall status changed - emit event
                health_persistence.emit_event(
                    event_type='state_change',
                    category='overall',
                    severity=overall,
                    data={
                        'previous': previous_overall,
                        'current': overall,
                        'summary': summary
                    }
                )

            # Track per-category state changes
            # Only transitions INTO WARNING/CRITICAL emit an event; a category
            # with no recorded previous status is treated as 'OK'.
            previous_details = getattr(self, '_last_category_statuses', {})
            for cat_key, cat_data in details.items():
                cat_status = cat_data.get('status', 'OK')
                prev_status = previous_details.get(cat_key, 'OK')
                if prev_status != cat_status and cat_status in ('WARNING', 'CRITICAL'):
                    health_persistence.emit_event(
                        event_type='state_change',
                        category=cat_key,
                        severity=cat_status,
                        data={
                            'previous': prev_status,
                            'current': cat_status,
                            'reason': cat_data.get('reason', '')
                        }
                    )

            self._last_overall_status = overall
            self._last_category_statuses = {k: v.get('status', 'OK') for k, v in details.items()}
        except Exception:
            pass  # Event emission should never break health checks

        return {
            'overall': overall,
            'summary': summary,
            'details': details,
            'timestamp': datetime.now().isoformat()
        }

    def _check_cpu_with_hysteresis(self) -> Dict[str, Any]:
        """Check CPU with hysteresis to avoid flapping alerts - requires 5min sustained high usage"""
        try:
            cpu_percent = psutil.cpu_percent(interval=0.1)  # 100ms sample - sufficient for health check
            current_time = time.time()

            state_key = 'cpu_usage'
            # Add this reading as well (supplements the sampler thread)
            self.state_history[state_key].append({
                'value': cpu_percent,
                'time': current_time
            })

            # Snapshot the list for thread-safe reading (sampler may append concurrently)
            cpu_snapshot = list(self.state_history[state_key])
            # Prune old entries via snapshot replacement (atomic assignment)
            self.state_history[state_key] = [
                entry for entry in cpu_snapshot
                if current_time - entry['time'] < 360
            ]

            critical_samples = [
                entry for entry in self.state_history[state_key]
                if entry['value'] >= self.CPU_CRITICAL and
                current_time - entry['time'] <= self.CPU_CRITICAL_DURATION
            ]

            warning_samples = [
                entry for entry in self.state_history[state_key]
                if entry['value'] >= self.CPU_WARNING and
                current_time - entry['time'] <= self.CPU_WARNING_DURATION
            ]

            recovery_samples = [
                entry for entry in self.state_history[state_key]
                if entry['value'] < self.CPU_RECOVERY and
                current_time - entry['time'] <= self.CPU_RECOVERY_DURATION
            ]

            if len(critical_samples) >= 3:
                status = 'CRITICAL'
                reason = f'CPU >{self.CPU_CRITICAL}% sustained for {self.CPU_CRITICAL_DURATION}s'
            elif len(warning_samples) >= 3 and len(recovery_samples) < 2:
                status = 'WARNING'
                reason = f'CPU >{self.CPU_WARNING}% sustained for {self.CPU_WARNING_DURATION}s'
            else:
                status = 'OK'
                reason = None

            temp_status = self._check_cpu_temperature()

            result = {
                'status': status,
                'usage': round(cpu_percent, 1),
                'cores': psutil.cpu_count()
            }

            if reason:
                result['reason'] = reason

            if temp_status and temp_status.get('status') != 'UNKNOWN':
                result['temperature'] = temp_status
                if temp_status.get('status') == 'CRITICAL':
                    result['status'] = 'CRITICAL'
                    result['reason'] = temp_status.get('reason')
                elif temp_status.get('status') == 'WARNING' and status == 'OK':
                    result['status'] = 'WARNING'
                    result['reason'] = temp_status.get('reason')

            # Build checks dict for frontend expandable section
            checks = {
                'cpu_usage': {
                    'status': status,
                    'detail': 'Sustained high CPU usage' if status != 'OK' else 'Normal'
                }
            }
            if temp_status and temp_status.get('status') != 'UNKNOWN':
                t_status = temp_status.get('status', 'OK')
                checks['cpu_temperature'] = {
                    'status': t_status,
                    'detail': 'Temperature elevated' if t_status != 'OK' else 'Normal'
                }
            else:
                checks['cpu_temperature'] = {
                    'status': 'INFO',
                    'detail': 'No temperature sensor detected - install lm-sensors if hardware supports it',
                }

            result['checks'] = checks
            return result

        except Exception as e:
            return {'status': 'UNKNOWN', 'reason': f'CPU check failed: {str(e)}', 'dismissable': True}

    def _check_cpu_temperature(self) -> Optional[Dict[str, Any]]:
        """
        Check CPU temperature with temporal logic:
        - WARNING if temp >80°C sustained for >3 minutes
        - Auto-clears if temp ≤80°C for 30 seconds
        - No dismiss button (non-dismissable)
        """
        cache_key = 'cpu_temp'
        current_time = time.time()

        # Check every 10 seconds instead of 60
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 10:
                return self.cached_results.get(cache_key)

        try:
            # Use shared journalctl cache to avoid duplicate calls
            journalctl_output = self._get_journalctl_10min_warnings()

            if journalctl_output:
                temps = []
                for line in journalctl_output.split('\n'):
                    if 'temp' in line.lower() and '_input' in line:
                        try:
                            temp = float(line.split(':')[1].strip())
                            temps.append(temp)
                        except:
                            continue

                if temps:
                    max_temp = max(temps)

                    state_key = 'cpu_temp_history'
                    # Add this reading (supplements the sampler thread)
                    self.state_history[state_key].append({
                        'value': max_temp,
                        'time': current_time
                    })

                    # Snapshot for thread-safe reading, then atomic prune
                    temp_snapshot = list(self.state_history[state_key])
                    self.state_history[state_key] = [
                        entry for entry in temp_snapshot
                        if current_time - entry['time'] < 240
                    ]

                    # Check if temperature >80°C for more than 3 minutes (180 seconds)
                    high_temp_samples = [
                        entry for entry in self.state_history[state_key]
                        if entry['value'] > 80 and current_time - entry['time'] <= 180
                    ]

                    # Check if temperature ≤80°C for last 30 seconds (recovery)
                    recovery_samples = [
                        entry for entry in self.state_history[state_key]
                        if entry['value'] <= 80 and current_time - entry['time'] <= 30
                    ]

                    # Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
                    if len(high_temp_samples) >= 18:
                        # Temperature has been >80°C for >3 minutes
                        status = 'WARNING'
                        reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'

                        # Record non-dismissable error
                        health_persistence.record_error(
                            error_key='cpu_temperature',
                            category='temperature',
                            severity='WARNING',
                            reason=reason,
                            details={'temperature': max_temp, 'dismissable': False}
                        )
                    elif len(recovery_samples) >= 3:
                        # Temperature has been ≤80°C for 30 seconds - clear the error
                        status = 'OK'
                        reason = None
                        health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
                    else:
                        # Temperature is elevated but not long enough, or recovering but not yet cleared
                        # Check if we already have an active error
                        if health_persistence.is_error_active('cpu_temperature', category='temperature'):
                            # Keep the warning active
                            status = 'WARNING'
                            reason = f'CPU temperature {max_temp}°C still elevated'
                        else:
                            # No active warning yet
                            status = 'OK'
                            reason = None

                    temp_result = {
                        'status': status,
                        'value': round(max_temp, 1),
                        'unit': '°C'
                    }
                    if reason:
                        temp_result['reason'] = reason

                    self.cached_results[cache_key] = temp_result
                    self.last_check_times[cache_key] = current_time
                    return temp_result

            return None

        except Exception:
            return None

    def _check_memory_comprehensive(self) -> Dict[str, Any]:
        """
        Check memory including RAM and swap with realistic thresholds.
        Only alerts on truly problematic memory situations.
        """
        try:
            memory = psutil.virtual_memory()
            swap = psutil.swap_memory()
            current_time = time.time()

            mem_percent = memory.percent
            swap_percent = swap.percent if swap.total > 0 else 0
            swap_vs_ram = (swap.used / memory.total * 100) if memory.total > 0 else 0

            state_key = 'memory_usage'
            self.state_history[state_key].append({
                'mem_percent': mem_percent,
                'swap_percent': swap_percent,
                'swap_vs_ram': swap_vs_ram,
                'time': current_time
            })

            self.state_history[state_key] = [
                entry for entry in self.state_history[state_key]
                if current_time - entry['time'] < 600
            ]

            mem_critical = sum(
                1 for entry in self.state_history[state_key]
                if entry['mem_percent'] >= 90 and
                current_time - entry['time'] <= self.MEMORY_DURATION
            )

            mem_warning = sum(
                1 for entry in self.state_history[state_key]
                if entry['mem_percent'] >= self.MEMORY_WARNING and
                current_time - entry['time'] <= self.MEMORY_DURATION
            )

            swap_critical = sum(
                1 for entry in self.state_history[state_key]
                if entry['swap_vs_ram'] > 20 and
                current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION
            )


            if mem_critical >= 2:
                status = 'CRITICAL'
                reason = f'RAM >90% for {self.MEMORY_DURATION}s'
            elif swap_critical >= 2:
                status = 'CRITICAL'
                reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)'
            elif mem_warning >= 2:
                status = 'WARNING'
                reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s'
            else:
                status = 'OK'
                reason = None

            ram_avail_gb = round(memory.available / (1024**3), 2)
            ram_total_gb = round(memory.total / (1024**3), 2)
            swap_used_gb = round(swap.used / (1024**3), 2)
            swap_total_gb = round(swap.total / (1024**3), 2)

            # Determine per-sub-check status
            ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK')
            swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK'

            result = {
                'status': status,
                'ram_percent': round(mem_percent, 1),
                'ram_available_gb': ram_avail_gb,
                'swap_percent': round(swap_percent, 1),
                'swap_used_gb': swap_used_gb,
                'checks': {
                    'ram_usage': {
                        'status': ram_status,
                        'detail': 'High RAM usage sustained' if ram_status != 'OK' else 'Normal'
                    },
                    'swap_usage': {
                        'status': swap_status,
                        'detail': 'Excessive swap usage' if swap_status != 'OK' else ('Normal' if swap.total > 0 else 'No swap configured')
                    }
                }
            }

            if reason:
                result['reason'] = reason

            return result

        except Exception as e:
            return {'status': 'UNKNOWN', 'reason': f'Memory check failed: {str(e)}', 'dismissable': True}

    def _check_storage_optimized(self) -> Dict[str, Any]:
        """
        Optimized storage check - monitors Proxmox storages from pvesm status.
        Checks for inactive storages, disk health from SMART/events, and ZFS pool health.
        """
        issues = []
        storage_details = {}

        # Check disk usage and mount status for important mounts.
        # We detect actual mountpoints dynamically rather than hard-coding.
        critical_mounts = set()
        critical_mounts.add('/')
        try:
            for part in psutil.disk_partitions(all=False):
                mp = part.mountpoint
                # Include standard system mounts and PVE storage
                if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
                   mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
                    critical_mounts.add(mp)
        except Exception:
            pass
        critical_mounts = sorted(critical_mounts)

        for mount_point in critical_mounts:
            try:
                result = subprocess.run(
                    ['mountpoint', '-q', mount_point],
                    capture_output=True,
                    timeout=2
                )

                if result.returncode != 0:
                    issues.append(f'{mount_point}: Not mounted')
                    storage_details[mount_point] = {
                        'status': 'CRITICAL',
                        'reason': 'Not mounted'
                    }
                    continue

                # Check if read-only
                with open('/proc/mounts', 'r') as f:
                    for line in f:
                        parts = line.split()
                        if len(parts) >= 4 and parts[1] == mount_point:
                            options = parts[3].split(',')
                            if 'ro' in options:
                                issues.append(f'{mount_point}: Mounted read-only')
                                storage_details[mount_point] = {
                                    'status': 'CRITICAL',
                                    'reason': 'Mounted read-only'
                                }
                                break # Found it, no need to check further for this mountpoint

                # Check filesystem usage only if not already flagged as critical
                if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
                    fs_status = self._check_filesystem(mount_point)
                    error_key = f'disk_space_{mount_point}'
                    if fs_status['status'] != 'OK':
                        issues.append(f"{mount_point}: {fs_status['reason']}")
                        storage_details[mount_point] = fs_status
                        # Record persistent error for notifications
                        usage = psutil.disk_usage(mount_point)
                        avail_gb = usage.free / (1024**3)
                        if avail_gb >= 1:
                            avail_str = f"{avail_gb:.1f} GiB"
                        else:
                            avail_str = f"{usage.free / (1024**2):.0f} MiB"
                        health_persistence.record_error(
                            error_key=error_key,
                            category='disk',
                            severity=fs_status['status'],
                            reason=f'{mount_point}: {fs_status["reason"]}',
                            details={
                                'mount': mount_point,
                                'used': str(round(usage.percent, 1)),
                                'available': avail_str,
                                'dismissable': False,
                            }
                        )
                    else:
                        # Space recovered -- clear any previous alert
                        health_persistence.clear_error(error_key)
            except Exception:
                pass # Silently skip if mountpoint check fails

        # Check ZFS pool health status
        zfs_pool_issues = self._check_zfs_pool_health()
        if zfs_pool_issues:
            for pool_name, pool_info in zfs_pool_issues.items():
                issues.append(f'{pool_name}: {pool_info["reason"]}')
                storage_details[pool_name] = pool_info

                # Record error for notification system
                real_pool = pool_info.get('pool_name', pool_name)
                zfs_error_key = f'zfs_pool_{real_pool}'
                zfs_reason = f'ZFS pool {real_pool}: {pool_info["reason"]}'
                try:
                    if not health_persistence.is_error_active(zfs_error_key, category='disks'):
                        health_persistence.record_error(
                            error_key=zfs_error_key,
                            category='disks',
                            severity=pool_info.get('status', 'WARNING'),
                            reason=zfs_reason,
                            details={
                                'pool_name': real_pool,
                                'health': pool_info.get('health', ''),
                                'device': f'zpool:{real_pool}',
                                'dismissable': False,
                            }
                        )
                except Exception:
                    pass

                # Record as permanent disk observation
                try:
                    health_persistence.record_disk_observation(
                        device_name=f'zpool_{real_pool}',
                        serial=None,
                        error_type='zfs_pool_error',
                        error_signature=f'zfs_{real_pool}_{pool_info.get("health", "unknown")}',
                        raw_message=zfs_reason,
                        severity=pool_info.get('status', 'WARNING').lower(),
                    )
                except Exception:
                    pass
        else:
            # ZFS pools are healthy -- clear any previously recorded ZFS errors
            if self.capabilities.get('has_zfs'):
                try:
                    active_errors = health_persistence.get_active_errors()
                    for error in active_errors:
                        if error.get('error_key', '').startswith('zfs_pool_'):
                            health_persistence.clear_error(error['error_key'])
                except Exception:
                    pass

        # Check disk health from Proxmox task log or system logs (SMART, etc.)
        disk_health_issues = self._check_disk_health_from_events()
        smart_warnings_found = False
        if disk_health_issues:
            for disk, issue in disk_health_issues.items():
                # Only add if not already covered by critical mountpoint issues
                if disk not in storage_details or storage_details[disk].get('status') == 'OK':
                    issues.append(f'{disk}: {issue["reason"]}')
                    storage_details[disk] = issue

                # Track if any SMART warnings were found (for smart_health sub-check)
                if issue.get('smart_lines'):
                    smart_warnings_found = True

                # Record error with full details for notification system
                # Avoid duplicate: if dmesg I/O errors already cover this disk
                # (disk_{device}), skip the journal SMART notification to prevent
                # the user getting two alerts for the same underlying problem.
                device = issue.get('device', disk.replace('/dev/', ''))
                io_error_key = f'disk_{device}'
                error_key = f'smart_{device}'
                reason = f'{disk}: {issue["reason"]}'
                severity = issue.get('status', 'WARNING')

                # Get serial for this disk to properly track it (important for USB disks)
                disk_serial = ''
                disk_model = ''
                try:
                    smart_result = subprocess.run(
                        ['smartctl', '-i', '-j', f'/dev/{device}'],
                        capture_output=True, text=True, timeout=5
                    )
                    if smart_result.returncode in (0, 4):
                        import json
                        smart_data = json.loads(smart_result.stdout)
                        disk_serial = smart_data.get('serial_number', '')
                        disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
                except Exception:
                    pass

                try:
                    if (not health_persistence.is_error_active(io_error_key, category='disks') and
                        not health_persistence.is_error_active(error_key, category='disks')):
                        health_persistence.record_error(
                            error_key=error_key,
                            category='disks',
                            severity=severity,
                            reason=reason,
                            details={
                                'disk': device,
                                'device': disk,
                                'block_device': device,
                                'serial': disk_serial,
                                'model': disk_model,
                                'smart_status': 'WARNING',
                                'smart_lines': issue.get('smart_lines', []),
                                'io_lines': issue.get('io_lines', []),
                                'sample': issue.get('sample', ''),
                                'source': 'journal',
                                'dismissable': True,
                            }
                        )
                    # Register the disk for observation tracking (worst_health no longer used)
                    if disk_serial:
                        health_persistence.register_disk(device, disk_serial, disk_model, 0)
                except Exception:
                    pass

        # Check LVM status
        lvm_status = self._check_lvm()
        if lvm_status.get('status') == 'WARNING':
            # LVM volumes might be okay but indicate potential issues
            issues.append(f"LVM check: {lvm_status.get('reason')}")
            storage_details['lvm_check'] = lvm_status

        # Check dmesg for real-time I/O errors (dmesg-based, complements journalctl SMART checks)
        dmesg_io_result = self._check_disks_optimized()
        if dmesg_io_result.get('status') != 'OK':
            dmesg_details = dmesg_io_result.get('details', {})
            for disk_path, disk_info in dmesg_details.items():
                if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
                    issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
                    storage_details[disk_path] = disk_info

                device = disk_path.replace('/dev/', '')
                io_severity = disk_info.get('status', 'WARNING').lower()

                # Get serial for proper disk tracking (important for USB)
                io_serial = ''
                io_model = ''
                try:
                    smart_result = subprocess.run(
                        ['smartctl', '-i', '-j', f'/dev/{device}'],
                        capture_output=True, text=True, timeout=5
                    )
                    if smart_result.returncode in (0, 4):
                        import json
                        smart_data = json.loads(smart_result.stdout)
                        io_serial = smart_data.get('serial_number', '')
                        io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
                except Exception:
                    pass

                # Register the disk for observation tracking (worst_health no longer used)
                try:
                    if io_serial:
                        health_persistence.register_disk(device, io_serial, io_model, 0)
                except Exception:
                    pass

        # Build checks dict from storage_details
        # We consolidate disk error entries (like /Dev/Sda) into physical disk entries
        # and only show disks with problems (not healthy ones).
        checks = {}
        disk_errors_by_device = {}  # Collect disk errors for consolidation

        for key, val in storage_details.items():
            # Check if this is a disk device entry (e.g., /Dev/Sda, /dev/sda, sda)
            key_lower = key.lower()
            is_disk_entry = (
                key_lower.startswith('/dev/') or
                key_lower.startswith('dev/') or
                (len(key_lower) <= 10 and (key_lower.startswith('sd') or
                 key_lower.startswith('nvme') or key_lower.startswith('hd')))
            )

            if is_disk_entry:
                # Extract device name and collect for consolidation
                device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
                if device_name and len(device_name) <= 15:
                    if device_name not in disk_errors_by_device:
                        disk_errors_by_device[device_name] = {
                            'status': val.get('status', 'WARNING'),
                            'detail': val.get('reason', ''),
                            'error_key': val.get('error_key'),
                            'dismissable': val.get('dismissable', True),
                        }
                    else:
                        # Merge: keep worst status
                        existing = disk_errors_by_device[device_name]
                        if val.get('status') == 'CRITICAL':
                            existing['status'] = 'CRITICAL'
                        # Append detail if different - with smart deduplication
                        new_detail = val.get('reason', '')
                        existing_detail = existing.get('detail', '')
                        if new_detail and new_detail not in existing_detail:
                            # Check for semantic duplicates by extracting key info
                            # Extract device references and key metrics from both
                            new_parts = set(p.strip() for p in new_detail.replace(';', '\n').split('\n') if p.strip())
                            existing_parts = set(p.strip() for p in existing_detail.replace(';', '\n').split('\n') if p.strip())

                            # Find truly new information (parts not already present)
                            unique_new_parts = []
                            for part in new_parts:
                                is_duplicate = False
                                # Check if this part's core content exists in any existing part
                                part_lower = part.lower()
                                for ex_part in existing_parts:
                                    ex_lower = ex_part.lower()
                                    # If >60% of words overlap, consider it duplicate
                                    part_words = set(part_lower.split())
                                    ex_words = set(ex_lower.split())
                                    if part_words and ex_words:
                                        overlap = len(part_words & ex_words) / min(len(part_words), len(ex_words))
                                        if overlap > 0.6:
                                            is_duplicate = True
                                            break
                                if not is_duplicate:
                                    unique_new_parts.append(part)

                            # Only append truly unique parts
                            if unique_new_parts:
                                unique_text = '; '.join(unique_new_parts)
                                existing['detail'] = f"{existing_detail}; {unique_text}".strip('; ')
                    continue  # Don't add raw disk error entry, we'll add consolidated later

            # Non-disk entries go directly to checks
            checks[key] = {
                'status': val.get('status', 'OK'),
                'detail': val.get('reason', 'OK'),
                **{k: v for k, v in val.items() if k not in ('status', 'reason')}
            }

        # Get physical disk info for matching errors to disks
        # This uses the same detection as flask_server.py /api/storage/info
        physical_disks = {}
        try:
            result = subprocess.run(
                ['lsblk', '-b', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN'],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0:
                for line in result.stdout.strip().split('\n'):
                    if not line.strip():
                        continue
                    parts = line.split()
                    if len(parts) >= 3 and parts[2] == 'disk':
                        disk_name = parts[0]
                        # Skip virtual devices
                        if disk_name.startswith(('zd', 'zram', 'loop', 'ram', 'dm-')):
                            continue
                        tran = parts[3].upper() if len(parts) > 3 else ''
                        is_usb = tran == 'USB'
                        is_nvme = disk_name.startswith('nvme')

                        # Get serial from smartctl
                        serial = ''
                        model = ''
                        try:
                            smart_result = subprocess.run(
                                ['smartctl', '-i', '-j', f'/dev/{disk_name}'],
                                capture_output=True, text=True, timeout=5
                            )
                            if smart_result.returncode in (0, 4):  # 4 = SMART not available but info OK
                                import json
                                smart_data = json.loads(smart_result.stdout)
                                serial = smart_data.get('serial_number', '')
                                model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
                        except Exception:
                            pass

                        physical_disks[disk_name] = {
                            'serial': serial,
                            'model': model,
                            'is_usb': is_usb,
                            'is_nvme': is_nvme,
                            'disk_type': 'USB' if is_usb else ('NVMe' if is_nvme else 'SATA'),
                        }
        except Exception:
            pass

        # Check disk_observations for active (non-dismissed) warnings
        # This ensures disks with persistent observations appear in Health Monitor
        # even if the error is not currently in the logs
        try:
            all_observations = health_persistence.get_disk_observations()
            for obs in all_observations:
                device_name = obs.get('device_name', '').replace('/dev/', '')
                if not device_name:
                    continue
                severity = (obs.get('severity') or 'warning').upper()
                # Only include if WARNING/CRITICAL and not already dismissed
                if severity in ('WARNING', 'CRITICAL') and not obs.get('dismissed'):
                    # Check if there's a corresponding acknowledged error in the errors table
                    # If so, skip this observation (it was dismissed via Health Monitor)
                    error_key = f"disk_smart_{device_name}"
                    error_record = health_persistence.get_error_by_key(error_key)
                    if error_record and error_record.get('acknowledged'):
                        continue  # Skip - this was dismissed

                    # Add to disk_errors_by_device if not already present
                    if device_name not in disk_errors_by_device:
                        obs_reason = obs.get('raw_message', f'{device_name}: Disk observation recorded')
                        disk_errors_by_device[device_name] = {
                            'status': severity,
                            'reason': obs_reason,
                            'error_type': obs.get('error_type', 'disk_observation'),
                            'serial': obs.get('serial', ''),
                            'model': obs.get('model', ''),
                            'dismissable': True,
                        }
        except Exception:
            pass

        # Add consolidated disk entries (only for disks with errors)
        for device_name, error_info in disk_errors_by_device.items():
            # Try to find this disk in physical_disks for enriched info
            disk_info = physical_disks.get(device_name, {})

            # If not found by name, try to match by serial (from error details)
            if not disk_info:
                error_serial = error_info.get('serial', '')
                if error_serial:
                    for dk, di in physical_disks.items():
                        if di.get('serial', '').lower() == error_serial.lower():
                            disk_info = di
                            device_name = dk  # Update device name to matched disk
                            break

            # Determine disk type
            disk_type = disk_info.get('disk_type', 'SATA')
            if not disk_info:
                # Fallback detection
                if device_name.startswith('nvme'):
                    disk_type = 'NVMe'
                else:
                    # Check if USB via sysfs
                    try:
                        usb_check = subprocess.run(
                            ['readlink', '-f', f'/sys/block/{device_name}'],
                            capture_output=True, text=True, timeout=2
                        )
                        if 'usb' in usb_check.stdout.lower():
                            disk_type = 'USB'
                    except Exception:
                        pass

            serial = disk_info.get('serial', '')
            model = disk_info.get('model', '')

            # Use current status directly from Proxmox/SMART - no persistent worst_health
            # Historical observations are preserved separately in disk_observations table
            current_status = error_info.get('status', 'WARNING')
            final_status = current_status

            # Build detail string with serial/model if available
            detail = error_info.get('detail', error_info.get('reason', 'Unknown error'))
            if serial and serial not in detail:
                detail = f"{serial} - {detail}"

            # Create consolidated disk entry
            check_key = f'/dev/{device_name}'
            checks[check_key] = {
                'status': final_status,
                'detail': detail,
                'disk_type': disk_type,
                'device': f'/dev/{device_name}',
                'serial': serial,
                'model': model,
                'error_key': error_info.get('error_key') or f'disk_smart_{device_name}',
                'dismissable': error_info.get('dismissable', True),
                'is_disk_entry': True,
            }

            # Add to issues array if WARNING or CRITICAL (ensures category status is correct)
            if final_status in ('WARNING', 'CRITICAL'):
                issue_msg = f'{check_key}: {detail}'
                if issue_msg not in issues:
                    issues.append(issue_msg)

            # Register disk in persistence if not already (for worst_health tracking)
            try:
                health_persistence.register_disk(device_name, serial if serial else None, model, 0)
            except Exception:
                pass

        # ALWAYS add descriptive entries for capabilities this server has.
        # When everything is OK, they show as OK.  When there are issues,
        # they still appear so the user can see the full picture (e.g.
        # LVM is OK even though I/O errors exist on a disk).
        if 'root_filesystem' not in checks:
            checks['root_filesystem'] = checks.pop('/', None) or {'status': 'OK', 'detail': 'Mounted read-write, space OK'}
        if 'io_errors' not in checks:
            # Only add OK if no disk I/O errors are present in checks
            has_io = any(v.get('error_count') or 'I/O' in str(v.get('detail', '')) for v in checks.values())
            if not has_io:
                checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
        if self.capabilities.get('has_smart') and 'smart_health' not in checks:
            if not smart_warnings_found:
                checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
            # When smart_warnings_found is True, the per-disk sub-checks
            # (/dev/sda etc.) already carry all the detail and dismiss logic.
            # Adding a separate smart_health WARNING would just duplicate them.
        if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
            checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
        if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
            checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}

        if not issues:
            return {'status': 'OK', 'checks': checks}

        # ── Mark dismissed checks ──
        # If an error_key in a check has been acknowledged (dismissed) in the
        # persistence DB, mark the check as dismissed so the frontend renders
        # it in blue instead of showing WARNING + Dismiss button.
        # Also recalculate category status: if ALL warning/critical checks are
        # dismissed, downgrade the category to OK.
        try:
            all_dismissed = True
            for check_key, check_val in checks.items():
                ek = check_val.get('error_key')
                if not ek:
                    continue
                check_status = (check_val.get('status') or 'OK').upper()
                if check_status in ('WARNING', 'CRITICAL'):
                    if health_persistence.is_error_acknowledged(ek):
                        check_val['dismissed'] = True
                    else:
                        all_dismissed = False

            # If every non-OK check is dismissed, downgrade the category
            non_ok_checks = [v for v in checks.values()
                             if (v.get('status') or 'OK').upper() in ('WARNING', 'CRITICAL')]
            if non_ok_checks and all(v.get('dismissed') for v in non_ok_checks):
                # All issues are dismissed -- category shows as OK to avoid
                # persistent WARNING after user has acknowledged.
                return {
                    'status': 'OK',
                    'reason': '; '.join(issues[:3]),
                    'details': storage_details,
                    'checks': checks,
                    'all_dismissed': True,
                }
        except Exception:
            pass

        # Determine overall status
        has_critical = any(
            d.get('status') == 'CRITICAL' for d in storage_details.values()
        )

        return {
            'status': 'CRITICAL' if has_critical else 'WARNING',
            'reason': '; '.join(issues[:3]),
            'details': storage_details,
            'checks': checks
        }

    def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
        """Check individual filesystem for space and mount status"""
        try:
            usage = psutil.disk_usage(mount_point)
            percent = usage.percent

            if percent >= self.STORAGE_CRITICAL:
                status = 'CRITICAL'
                reason = f'{percent:.1f}% full (≥{self.STORAGE_CRITICAL}%)'
            elif percent >= self.STORAGE_WARNING:
                status = 'WARNING'
                reason = f'{percent:.1f}% full (≥{self.STORAGE_WARNING}%)'
            else:
                status = 'OK'
                reason = None

            result = {
                'status': status,
                'usage_percent': round(percent, 1)
            }

            if reason:
                result['reason'] = reason

            return result

        except Exception as e:
            return {
                'status': 'WARNING',
                'reason': f'Check failed: {str(e)}'
            }

    def _check_lvm(self) -> Dict[str, Any]:
        """Check LVM volumes - improved detection"""
        try:
            # Check if lvs command is available
            result_which = subprocess.run(
                ['which', 'lvs'],
                capture_output=True,
                text=True,
                timeout=1
            )
            if result_which.returncode != 0:
                return {'status': 'OK'} # LVM not installed

            result = subprocess.run(
                ['lvs', '--noheadings', '--options', 'lv_name,vg_name,lv_attr'],
                capture_output=True,
                text=True,
                timeout=3
            )

            if result.returncode != 0:
                return {'status': 'WARNING', 'reason': 'lvs command failed'}

            volumes = []
            for line in result.stdout.strip().split('\n'):
                if line.strip():
                    parts = line.split()
                    if len(parts) >= 2:
                        lv_name = parts[0].strip()
                        vg_name = parts[1].strip()
                        # Check for 'a' attribute indicating active/available
                        if 'a' in parts[2]:
                            volumes.append(f'{vg_name}/{lv_name}')

            # If LVM is configured but no active volumes are found, it might be an issue or just not used
            if not volumes:
                # Check if any VGs exist to determine if LVM is truly unconfigured or just inactive
                vg_result = subprocess.run(
                    ['vgs', '--noheadings', '--options', 'vg_name'],
                    capture_output=True,
                    text=True,
                    timeout=3
                )
                if vg_result.returncode == 0 and vg_result.stdout.strip():
                    return {'status': 'WARNING', 'reason': 'No active LVM volumes detected'}
                else:
                    return {'status': 'OK'} # No VGs found, LVM not in use

            return {'status': 'OK', 'volumes': len(volumes)}

        except Exception:
            return {'status': 'OK'}

    # This function is no longer used in get_detailed_status, but kept for reference if needed.
    # The new _check_proxmox_storage function handles this logic better.
    def _check_proxmox_storages(self) -> Dict[str, Any]:
        """Check Proxmox-specific storages (only report problems)"""
        storages = {}

        try:
            if os.path.exists('/etc/pve/storage.cfg'):
                with open('/etc/pve/storage.cfg', 'r') as f:
                    current_storage = None
                    storage_type = None

                    for line in f:
                        line = line.strip()

                        if line.startswith('dir:') or line.startswith('nfs:') or \
                           line.startswith('cifs:') or line.startswith('pbs:') or \
                           line.startswith('rbd:') or line.startswith('cephfs:') or \
                           line.startswith('zfs:') or line.startswith('zfs-send:'):
                            parts = line.split(':', 1)
                            storage_type = parts[0]
                            current_storage = parts[1].strip()
                        elif line.startswith('path ') and current_storage:
                            path = line.split(None, 1)[1]

                            if storage_type == 'dir':
                                if not os.path.exists(path):
                                    storages[f'storage_{current_storage}'] = {
                                        'status': 'CRITICAL',
                                        'reason': 'Directory does not exist',
                                        'type': 'dir',
                                        'path': path
                                    }

                            current_storage = None
                            storage_type = None
        except Exception:
            pass

        return storages

    @staticmethod
    def _make_io_obs_signature(disk: str, sample: str) -> str:
        """Create a stable observation signature for I/O errors on a disk.

        All ATA errors on the same disk (exception Emask, revalidation failed,
        hard resetting link, SError, etc.) map to ONE signature per error family.
        This ensures that "Emask 0x1 SAct 0xc1000000" and "Emask 0x1 SAct 0x804000"
        and "revalidation failed" all dedup into the same observation.
        """
        if not sample:
            return f'io_{disk}_generic'

        s = sample.lower()

        # Classify into error families (order matters: first match wins)
        families = [
            # ATA controller errors: exception, emask, revalidation, reset
            # All these are symptoms of the same underlying connection issue
            (r'exception\s+emask|emask\s+0x|revalidation failed|hard resetting link|'
             r'serror.*badcrc|comreset|link is slow|status.*drdy',
             'ata_connection_error'),
            # SCSI / block-layer errors
            (r'i/o error|blk_update_request|medium error|sense key',
             'block_io_error'),
            # Failed commands (READ/WRITE FPDMA QUEUED)
            (r'failed command|fpdma queued',
             'ata_failed_command'),
        ]

        for pattern, family in families:
            if re.search(pattern, s):
                return f'io_{disk}_{family}'

        # Fallback: generic per-disk
        return f'io_{disk}_generic'

    def _resolve_ata_to_disk(self, ata_port: str) -> str:
        """Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda').

        Uses /sys/class/ata_port/ symlinks and /sys/block/ to find the mapping.
        Falls back to parsing dmesg for 'ata8: SATA link up' -> 'sd 7:0:0:0: [sda]'.
        """
        if not ata_port or not ata_port.startswith('ata'):
            return ata_port

        port_num = ata_port.replace('ata', '')

        # Method 1: Walk /sys/class/ata_port/ -> host -> target -> block
        try:
            ata_path = f'/sys/class/ata_port/{ata_port}'
            if os.path.exists(ata_path):
                device_path = os.path.realpath(ata_path)
                # Walk up to find the SCSI host, then find block devices
                # Path: /sys/devices/.../ataX/hostY/targetY:0:0/Y:0:0:0/block/sdZ
                for root, dirs, files in os.walk(os.path.dirname(device_path)):
                    if 'block' in dirs:
                        block_path = os.path.join(root, 'block')
                        devs = os.listdir(block_path)
                        if devs:
                            return devs[0]  # e.g. 'sda'
        except (OSError, IOError):
            pass

        # Method 2: Parse dmesg for ATA link messages
        try:
            result = subprocess.run(
                ['dmesg', '--notime'],
                capture_output=True, text=True, timeout=2
            )
            if result.returncode == 0:
                # Look for "ata8: SATA link up" followed by "sd X:0:0:0: [sda]"
                lines = result.stdout.split('\n')
                host_num = None
                for line in lines:
                    m = re.search(rf'{ata_port}:\s+SATA link', line)
                    if m:
                        # ata port number maps to host(N-1) typically
                        host_num = int(port_num) - 1
                    if host_num is not None:
                        m2 = re.search(rf'sd\s+{host_num}:\d+:\d+:\d+:\s+\[(\w+)\]', line)
                        if m2:
                            return m2.group(1)
        except (OSError, subprocess.TimeoutExpired):
            pass

        # Method 3: Use /sys/block/sd* and trace back to ATA host number
        # ata8 => host7 (N-1) or host8 depending on controller numbering
        try:
            for sd in sorted(os.listdir('/sys/block')):
                if not sd.startswith('sd'):
                    continue
                # /sys/block/sdX/device -> ../../hostN/targetN:0:0/N:0:0:0
                dev_link = f'/sys/block/{sd}/device'
                if os.path.islink(dev_link):
                    real_path = os.path.realpath(dev_link)
                    # Check if 'ataX' appears in the device path
                    if f'/{ata_port}/' in real_path or f'/ata{port_num}/' in real_path:
                        return sd
                    # Also check host number mapping: ata8 -> host7 (N-1 convention)
                    for offset in (0, -1):
                        host_n = int(port_num) + offset
                        if host_n >= 0 and f'/host{host_n}/' in real_path:
                            # Verify: check if ataX appears in the chain
                            parent = real_path
                            while parent and parent != '/':
                                parent = os.path.dirname(parent)
                                if os.path.basename(parent) == ata_port:
                                    return sd
                                # Check 1 level: /sys/devices/.../ataX/hostY/...
                                ata_check = os.path.join(os.path.dirname(parent), ata_port)
                                if os.path.exists(ata_check):
                                    return sd
        except (OSError, IOError, ValueError):
            pass

        return ata_port  # Return original if resolution fails

    def _identify_block_device(self, device: str) -> str:
        """
        Identify a block device by querying lsblk.
        Returns a human-readable string like:
          "KINGSTON SA400S37960G (SSD, 894.3G) mounted at /mnt/data"
        Returns empty string if the device is not found in lsblk.
        """
        if not device or device == 'unknown':
            return ''
        try:
            candidates = [device]
            base = re.sub(r'\d+$', '', device) if not ('nvme' in device or 'mmcblk' in device) else device
            if base != device:
                candidates.append(base)

            for dev in candidates:
                dev_path = f'/dev/{dev}' if not dev.startswith('/') else dev
                result = subprocess.run(
                    ['lsblk', '-ndo', 'NAME,MODEL,SIZE,TRAN,MOUNTPOINT,ROTA', dev_path],
                    capture_output=True, text=True, timeout=3
                )
                if result.returncode == 0 and result.stdout.strip():
                    fields = result.stdout.strip().split(None, 5)
                    name = fields[0] if len(fields) > 0 else dev
                    model = fields[1] if len(fields) > 1 and fields[1] else 'Unknown model'
                    size = fields[2] if len(fields) > 2 else '?'
                    tran = (fields[3] if len(fields) > 3 else '').upper()
                    mountpoint = fields[4] if len(fields) > 4 and fields[4] else ''
                    rota = fields[5].strip() if len(fields) > 5 else '1'

                    if tran == 'USB':
                        disk_type = 'USB'
                    elif tran == 'NVME' or 'nvme' in name:
                        disk_type = 'NVMe'
                    elif rota == '0':
                        disk_type = 'SSD'
                    else:
                        disk_type = 'HDD'

                    info = f'{model} ({disk_type}, {size})'
                    if mountpoint:
                        info += f' mounted at {mountpoint}'
                    elif dev != device:
                        part_result = subprocess.run(
                            ['lsblk', '-ndo', 'MOUNTPOINT', f'/dev/{device}'],
                            capture_output=True, text=True, timeout=2
                        )
                        part_mount = part_result.stdout.strip() if part_result.returncode == 0 else ''
                        if part_mount:
                            info += f' partition {device} mounted at {part_mount}'
                        else:
                            info += ' -- not mounted'
                    else:
                        info += ' -- not mounted'
                    return info
            return ''
        except Exception:
            return ''

    def _quick_smart_health(self, disk_name: str) -> str:
        """Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.

        Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
        """
        if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
            return 'UNKNOWN'

        # Check cache first
        current_time = time.time()
        cache_key = disk_name
        cached = self._smart_cache.get(cache_key)
        if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
            return cached['result']

        try:
            dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
            result = subprocess.run(
                ['smartctl', '--health', '-j', dev_path],
                capture_output=True, text=True, timeout=5
            )
            import json as _json
            data = _json.loads(result.stdout)
            passed = data.get('smart_status', {}).get('passed', None)
            if passed is True:
                smart_result = 'PASSED'
            elif passed is False:
                smart_result = 'FAILED'
            else:
                smart_result = 'UNKNOWN'

            # Cache the result
            self._smart_cache[cache_key] = {'result': smart_result, 'time': current_time}
            return smart_result
        except Exception:
            return 'UNKNOWN'

    def _check_all_disks_smart(self, fallback: str = 'UNKNOWN') -> str:
        """Check SMART health of ALL physical disks.

        Used when an ATA port can't be resolved to a specific /dev/sdX.
        If ALL disks report PASSED, returns 'PASSED' (errors are transient).
        If ANY disk reports FAILED, returns 'FAILED'.
        Otherwise returns the fallback value.
        """
        try:
            # List all block devices (exclude partitions, loop, zram, dm)
            result = subprocess.run(
                ['lsblk', '-dnpo', 'NAME,TYPE'],
                capture_output=True, text=True, timeout=3
            )
            if result.returncode != 0:
                return fallback

            disks = []
            for line in result.stdout.strip().split('\n'):
                parts = line.split()
                if len(parts) >= 2 and parts[1] == 'disk':
                    disks.append(parts[0])  # e.g. /dev/sda

            if not disks:
                return fallback

            all_passed = True
            any_failed = False
            checked = 0

            for dev in disks:
                health = self._quick_smart_health(dev)
                if health == 'PASSED':
                    checked += 1
                elif health == 'FAILED':
                    any_failed = True
                    break
                else:
                    all_passed = False  # Can't confirm this disk

            if any_failed:
                return 'FAILED'
            if all_passed and checked > 0:
                return 'PASSED'
            return fallback
        except Exception:
            return fallback

    def _check_disks_optimized(self) -> Dict[str, Any]:
        """
        Disk I/O error check -- the SINGLE source of truth for disk errors.

        Reads dmesg for I/O/ATA/SCSI errors, counts per device, records in
        health_persistence, and returns status for the health dashboard.
        Resolves ATA controller names (ata8) to physical disks (sda).

        Cross-references SMART health to avoid false positives from transient
        ATA controller errors. If SMART reports PASSED, dmesg errors are
        downgraded to INFO (transient).
        """
        current_time = time.time()
        disk_results = {}  # Single dict for both WARNING and CRITICAL

        # Common transient ATA patterns that auto-recover and are not real disk failures.
        # These are bus/controller level events, NOT media errors:
        #   action 0x0 = no action needed (fully recovered)
        #   action 0x6 = hard reset + port reinit (common cable/connector recovery)
        #   SError with BadCRC/Dispar = signal integrity issue (cable, not disk)
        #   Emask 0x10 = ATA bus error (controller/interconnect, not media)
        TRANSIENT_PATTERNS = [
            re.compile(r'exception\s+emask.*action\s+0x[06]', re.IGNORECASE),
            re.compile(r'serror.*=.*0x[0-9a-f]+\s*\(', re.IGNORECASE),
            re.compile(r'SError:.*\{.*(?:BadCRC|Dispar|CommWake).*\}', re.IGNORECASE),
            re.compile(r'emask\s+0x10\s+\(ATA bus error\)', re.IGNORECASE),
            re.compile(r'failed command:\s*READ FPDMA QUEUED', re.IGNORECASE),
        ]

        try:
            # Check dmesg for I/O errors in the last 5 minutes
            result = subprocess.run(
                ['dmesg', '-T', '--level=err,warn', '--since', '5 minutes ago'],
                capture_output=True,
                text=True,
                timeout=2
            )

            # Collect a sample line per device for richer error messages
            disk_samples = {}
            # Track if ALL errors for a device are transient patterns
            disk_transient_only = {}

            if result.returncode == 0:
                for line in result.stdout.split('\n'):
                    line_lower = line.lower()
                    # Detect various disk error formats
                    is_disk_error = any(kw in line_lower for kw in [
                        'i/o error', 'scsi error', 'medium error',
                        'failed command:', 'exception emask',
                    ])
                    ata_match = re.search(r'(ata\d+)[\.\d]*:.*(?:error|failed|exception)', line_lower)
                    if ata_match:
                        is_disk_error = True

                    if is_disk_error:
                        # Check if this specific line is a known transient pattern
                        is_transient = any(p.search(line) for p in TRANSIENT_PATTERNS)

                        # Extract device from multiple formats
                        raw_device = None
                        for dev_re in [
                            r'dev\s+(sd[a-z]+)',          # dev sdb
                            r'\[(sd[a-z]+)\]',            # [sda]
                            r'/dev/(sd[a-z]+)',            # /dev/sda
                            r'(nvme\d+n\d+)',             # nvme0n1
                            r'device\s+(sd[a-z]+\d*)',    # device sda1
                            r'(ata\d+)',                  # ata8 (ATA controller)
                        ]:
                            dm = re.search(dev_re, line)
                            if dm:
                                raw_device = dm.group(1)
                                break

                        if raw_device:
                            # Resolve ATA port to physical disk name
                            if raw_device.startswith('ata'):
                                resolved = self._resolve_ata_to_disk(raw_device)
                                disk_name = resolved
                            else:
                                disk_name = raw_device.rstrip('0123456789') if raw_device.startswith('sd') else raw_device

                            self.io_error_history[disk_name].append(current_time)
                            if disk_name not in disk_samples:
                                clean = re.sub(r'^\[.*?\]\s*', '', line.strip())
                                disk_samples[disk_name] = clean[:200]

                            # Track transient status: if ANY non-transient error is found, mark False
                            if disk_name not in disk_transient_only:
                                disk_transient_only[disk_name] = is_transient
                            elif not is_transient:
                                disk_transient_only[disk_name] = False

                # Clean old history and evaluate per-disk status
                for disk in list(self.io_error_history.keys()):
                    self.io_error_history[disk] = [
                        t for t in self.io_error_history[disk]
                        if current_time - t < 300
                    ]

                    error_count = len(self.io_error_history[disk])
                    error_key = f'disk_{disk}'
                    sample = disk_samples.get(disk, '')
                    display = f'/dev/{disk}' if not disk.startswith('/') else disk
                    all_transient = disk_transient_only.get(disk, False)

                    if error_count >= 1:
                        # Cross-reference with SMART to determine real severity
                        smart_health = self._quick_smart_health(disk)

                        # If SMART is UNKNOWN (unresolved ATA port), check ALL
                        # physical disks.  If every disk passes SMART, the ATA
                        # errors are transient bus/controller noise.
                        if smart_health == 'UNKNOWN':
                            smart_health = self._check_all_disks_smart(smart_health)

                        smart_ok = smart_health == 'PASSED'

                        # Resolve ATA name to block device early so we can use it
                        # in both record_error details AND record_disk_observation.
                        resolved_block = disk
                        resolved_serial = None
                        if disk.startswith('ata'):
                            resolved_block = self._resolve_ata_to_disk(disk)
                            # Get serial from the resolved device
                            try:
                                dev_path = f'/dev/{resolved_block}' if resolved_block != disk else None
                                if dev_path:
                                    sm = subprocess.run(
                                        ['smartctl', '-i', dev_path],
                                        capture_output=True, text=True, timeout=3)
                                    if sm.returncode in (0, 4):
                                        for sline in sm.stdout.split('\n'):
                                            if 'Serial Number' in sline or 'Serial number' in sline:
                                                resolved_serial = sline.split(':')[-1].strip()
                                                break
                            except Exception:
                                pass
                        else:
                            try:
                                sm = subprocess.run(
                                    ['smartctl', '-i', f'/dev/{disk}'],
                                    capture_output=True, text=True, timeout=3)
                                if sm.returncode in (0, 4):
                                    for sline in sm.stdout.split('\n'):
                                        if 'Serial Number' in sline or 'Serial number' in sline:
                                            resolved_serial = sline.split(':')[-1].strip()
                                            break
                            except Exception:
                                pass

                        # ── Record disk observation (always, even if transient) ──
                        # Signature must be stable across cycles: strip volatile
                        # data (hex values, counts, timestamps) to dedup properly.
                        # e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
                        # and  "ata8.00: revalidation failed (errno=-2)"
                        # both map to the same per-device I/O observation.
                        try:
                            obs_sig = self._make_io_obs_signature(disk, sample)
                            obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
                            health_persistence.record_disk_observation(
                                device_name=resolved_block,
                                serial=resolved_serial,
                                error_type='io_error',
                                error_signature=obs_sig,
                                raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
                                severity=obs_severity,
                            )
                        except Exception:
                            pass

                        # Transient-only errors (e.g. SError with auto-recovery)
                        # are always INFO regardless of SMART
                        if all_transient:
                            reason = f'{display}: {error_count} transient ATA event(s) in 5 min (auto-recovered)'
                            if sample:
                                reason += f'\n{sample}'
                            health_persistence.resolve_error(error_key, 'Transient ATA events, auto-recovered')
                            disk_results[display] = {
                                'status': 'INFO',
                                'reason': reason,
                                'device': disk,
                                'error_count': error_count,
                                'smart_status': smart_health,
                                'dismissable': False,
                                'error_key': error_key,
                            }
                        elif smart_ok:
                            # SMART is healthy -> dmesg errors are informational only
                            # The disk is fine; these are transient controller/bus events
                            reason = f'{display}: {error_count} I/O event(s) in 5 min (SMART: OK)'
                            if sample:
                                reason += f'\n{sample}'

                            # Resolve any previous error since SMART confirms disk is healthy
                            health_persistence.resolve_error(error_key, 'SMART healthy, I/O events are transient')

                            disk_results[display] = {
                                'status': 'INFO',
                                'reason': reason,
                                'device': disk,
                                'error_count': error_count,
                                'smart_status': smart_health,
                                'dismissable': False,
                                'error_key': error_key,
                            }
                        elif smart_health == 'FAILED':
                            # SMART confirms a real disk failure
                            severity = 'CRITICAL'
                            reason = f'{display}: {error_count} I/O error(s) in 5 min (SMART: FAILED)'
                            if sample:
                                reason += f'\n{sample}'

                            health_persistence.record_error(
                                error_key=error_key,
                                category='disks',
                                severity=severity,
                                reason=reason,
                                details={'disk': disk, 'device': display,
                                         'block_device': resolved_block,
                                         'serial': resolved_serial or '',
                                         'error_count': error_count,
                                         'smart_status': smart_health,
                                         'sample': sample, 'dismissable': False}
                            )
                            disk_results[display] = {
                                'status': severity,
                                'reason': reason,
                                'device': disk,
                                'error_count': error_count,
                                'smart_status': smart_health,
                                'dismissable': False,
                                'error_key': error_key,
                            }
                        else:
                            # SMART is genuinely UNKNOWN (no disk resolved, no
                            # smartctl at all) -- treat as WARNING, not CRITICAL.
                            # These are likely transient and will auto-resolve.
                            severity = 'WARNING'
                            reason = f'{display}: {error_count} I/O event(s) in 5 min (SMART: unavailable)'
                            if sample:
                                reason += f'\n{sample}'

                            # Only record to persistence ONCE.  If the error is
                            # already active, don't call record_error again --
                            # that would keep updating last_seen and preventing
                            # the freshness check from detecting it as stale.
                            if not health_persistence.is_error_active(error_key, category='disks'):
                                health_persistence.record_error(
                                    error_key=error_key,
                                    category='disks',
                                    severity=severity,
                                    reason=reason,
                                    details={'disk': disk, 'device': display,
                                             'block_device': resolved_block,
                                             'serial': resolved_serial or '',
                                             'error_count': error_count,
                                             'smart_status': smart_health,
                                             'sample': sample, 'dismissable': True}
                                )

                            disk_results[display] = {
                                'status': severity,
                                'reason': reason,
                                'device': disk,
                                'error_count': error_count,
                                'smart_status': smart_health,
                                'dismissable': True,
                                'error_key': error_key,
                            }
                    else:
                        health_persistence.resolve_error(error_key, 'Disk errors cleared')

            # Also include active filesystem errors (detected by _check_log_analysis
            # and cross-referenced to the 'disks' category)
            try:
                fs_errors = health_persistence.get_active_errors(category='disks')
                for err in fs_errors:
                    err_key = err.get('error_key', '')
                    if not err_key.startswith('disk_fs_'):
                        continue  # Only filesystem cross-references

                    # Skip acknowledged/dismissed errors
                    if err.get('acknowledged') == 1:
                        continue

                    details = err.get('details', {})
                    if isinstance(details, str):
                        try:
                            import json as _json
                            details = _json.loads(details)
                        except Exception:
                            details = {}

                    device = details.get('device', err_key.replace('disk_fs_', '/dev/'))
                    base_disk = details.get('disk', '')

                    # Check if the device still exists.  If not, auto-resolve
                    # the error -- it was likely a disconnected USB/temp device.
                    dev_path = f'/dev/{base_disk}' if base_disk else device

                    # Also extract base disk from partition (e.g., sdb1 -> sdb)
                    if not base_disk and device:
                        # Remove /dev/ prefix and partition number
                        dev_name = device.replace('/dev/', '')
                        base_disk = re.sub(r'\d+$', '', dev_name)  # sdb1 -> sdb
                        if base_disk:
                            dev_path = f'/dev/{base_disk}'

                    # Check both the specific device and the base disk
                    device_exists = os.path.exists(dev_path)
                    if not device_exists and device and device != dev_path:
                        device_exists = os.path.exists(device)

                    if not device_exists:
                        health_persistence.resolve_error(
                            err_key, 'Device no longer present in system')
                        continue

                    # Cross-reference with SMART: if SMART is healthy for
                    # this disk, downgrade to INFO (transient fs error).
                    severity = err.get('severity', 'WARNING')
                    if base_disk:
                        smart_health = self._quick_smart_health(base_disk)
                        if smart_health == 'PASSED' and severity == 'CRITICAL':
                            severity = 'WARNING'

                    if device not in disk_results:
                        disk_results[device] = {
                            'status': severity,
                            'reason': err.get('reason', 'Filesystem error'),
                            'device': base_disk,
                            'error_count': 1,
                            'error_type': 'filesystem',
                            'dismissable': True,
                            'error_key': err_key,
                        }
            except Exception:
                pass

            if not disk_results:
                return {'status': 'OK'}

            # Overall status: only count WARNING+ (skip INFO)
            active_results = {k: v for k, v in disk_results.items() if v.get('status') not in ('OK', 'INFO')}
            if not active_results:
                return {
                    'status': 'OK',
                    'reason': 'Transient ATA events only (SMART healthy)',
                    'details': disk_results
                }

            has_critical = any(d.get('status') == 'CRITICAL' for d in active_results.values())

            return {
                'status': 'CRITICAL' if has_critical else 'WARNING',
                'reason': f"{len(active_results)} disk(s) with errors",
                'details': disk_results
            }

        except Exception as e:
            print(f"[HealthMonitor] Disk/IO check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'Disk check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    def _check_network_optimized(self) -> Dict[str, Any]:
        """
        Evaluate interface link state and gateway connectivity.

        The loopback interface is ignored and user-excluded interfaces are
        reported with status EXCLUDED.  A DOWN interface produces a CRITICAL
        issue only when it is a ``vmbr*`` bridge, or a physical NIC
        (``eth``/``ens``/``enp``/``eno`` prefix) that holds an IPv4 address
        or has non-zero traffic counters.  The result of
        ``_check_network_latency`` is attached as the ``connectivity`` entry.

        Returns:
            A dict with ``status`` and ``checks``; ``reason`` and ``details``
            are added when at least one issue was found.  Any unexpected
            exception yields an UNKNOWN result.
        """
        physical_prefixes = ('eth', 'ens', 'enp', 'eno')

        def _down_alert_reason(nic, addrs_by_nic, io_by_nic):
            # Decide whether a DOWN interface deserves an alert; None means no.
            if nic.startswith('vmbr'):
                # Bridges are always considered important for VMs/LXCs
                return 'Bridge interface DOWN (VMs/LXCs may be affected)'
            if not nic.startswith(physical_prefixes):
                return None

            # Configured: at least one address of family 2 (IPv4)
            holds_ipv4 = any(addr.family == 2 for addr in addrs_by_nic.get(nic, ()))

            # Used: any byte ever sent or received
            counters = io_by_nic.get(nic)
            saw_traffic = counters is not None and (
                counters.bytes_sent > 0 or counters.bytes_recv > 0
            )

            if holds_ipv4:
                return 'Configured interface DOWN (has IP address)'
            if saw_traffic:
                return 'Active interface DOWN (was handling traffic)'
            return None

        try:
            problems = []
            details_by_nic = {}

            link_stats = psutil.net_if_stats()

            # Counters and addresses are optional inputs: fall back to empty
            try:
                io_by_nic = psutil.net_io_counters(pernic=True)
            except Exception:
                io_by_nic = {}

            try:
                addrs_by_nic = psutil.net_if_addrs()
            except Exception:
                addrs_by_nic = {}

            # Interfaces the user excluded from health checks
            excluded = health_persistence.get_excluded_interface_names('health')

            up_nics = set()

            for nic, link in link_stats.items():
                if nic == 'lo':
                    continue

                if nic in excluded:
                    details_by_nic[nic] = {
                        'status': 'EXCLUDED',
                        'reason': 'Excluded from monitoring',
                        'is_up': link.isup,
                        'dismissable': True
                    }
                    continue

                if link.isup:
                    up_nics.add(nic)
                    if nic.startswith('vmbr') or nic.startswith(physical_prefixes):
                        health_persistence.resolve_error(nic, 'Interface recovered')
                    continue

                down_reason = _down_alert_reason(nic, addrs_by_nic, io_by_nic)
                if not down_reason:
                    # Unused interface that is DOWN: not worth an alert
                    continue

                problems.append(f'{nic} is DOWN')
                shown_reason = down_reason or 'Interface DOWN'

                health_persistence.record_error(
                    error_key=nic,
                    category='network',
                    severity='CRITICAL',
                    reason=shown_reason,
                    details={'interface': nic, 'dismissable': False}
                )

                details_by_nic[nic] = {
                    'status': 'CRITICAL',
                    'reason': shown_reason,
                    'dismissable': False
                }

            # Connectivity (latency) comes from the gateway monitor database
            latency = self._check_network_latency()
            connectivity = {'status': 'OK', 'detail': 'Not tested'}
            if latency:
                measured_ms = latency.get('latency_ms', 'N/A')
                latency_level = latency.get('status', 'OK')
                details_by_nic['connectivity'] = latency

                if isinstance(measured_ms, (int, float)):
                    summary = f'Latency {measured_ms}ms to gateway'
                else:
                    summary = latency.get('reason', 'Unknown')

                connectivity = {
                    # UNKNOWN latency is shown as OK in the per-check view
                    'status': 'OK' if latency_level == 'UNKNOWN' else latency_level,
                    'detail': summary,
                }
                if latency_level not in ('OK', 'INFO', 'UNKNOWN'):
                    problems.append(latency.get('reason', 'Network latency issue'))

            # Per-item view: UP interfaces first, then recorded details
            checks = {nic: {'status': 'OK', 'detail': 'UP'} for nic in up_nics}
            for nic, info in details_by_nic.items():
                if nic == 'connectivity':
                    continue
                checks[nic] = {
                    'status': info.get('status', 'OK'),
                    'detail': info.get('reason', 'DOWN'),
                    'dismissable': info.get('dismissable', False)
                }
            checks['connectivity'] = connectivity

            if not problems:
                return {'status': 'OK', 'checks': checks}

            overall = 'WARNING'
            for info in details_by_nic.values():
                if info.get('status') == 'CRITICAL':
                    overall = 'CRITICAL'
                    break

            return {
                'status': overall,
                'reason': '; '.join(problems[:2]),
                'details': details_by_nic,
                'checks': checks
            }

        except Exception as e:
            print(f"[HealthMonitor] Network check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'Network check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    def _check_network_latency(self) -> Optional[Dict[str, Any]]:
        """Check network latency by reading from the gateway latency monitor database.

        Reads the most recent gateway latency measurement from the SQLite database
        that is updated every 60 seconds by the latency monitor thread.
        This avoids redundant ping operations and uses the existing monitoring data.
        """
        cache_key = 'network_latency'
        current_time = time.time()

        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 60:
                return self.cached_results.get(cache_key)

        try:
            import sqlite3
            db_path = "/usr/local/share/proxmenux/monitor.db"

            # Check if database exists
            if not os.path.exists(db_path):
                return {'status': 'UNKNOWN', 'reason': 'Latency monitor database not available', 'dismissable': True}

            conn = sqlite3.connect(db_path, timeout=5)
            cursor = conn.execute(
                """SELECT latency_avg, latency_min, latency_max, packet_loss, timestamp
                   FROM latency_history
                   WHERE target = 'gateway'
                   ORDER BY timestamp DESC
                   LIMIT 1"""
            )
            row = cursor.fetchone()
            conn.close()

            if row and row[0] is not None:
                avg_latency = row[0]
                min_latency = row[1]
                max_latency = row[2]
                packet_loss = row[3] or 0
                data_age = current_time - row[4]

                # If data is older than 2 minutes, consider it stale
                if data_age > 120:
                    stale_result = {
                        'status': 'UNKNOWN',
                        'reason': 'Latency data is stale (>2 min old)'
                    }
                    self.cached_results[cache_key] = stale_result
                    self.last_check_times[cache_key] = current_time
                    return stale_result

                # Check for packet loss first
                if packet_loss >= 100:
                    loss_result = {
                        'status': 'CRITICAL',
                        'reason': 'Packet loss to gateway (100% loss)',
                        'latency_ms': None,
                        'packet_loss': packet_loss
                    }
                    self.cached_results[cache_key] = loss_result
                    self.last_check_times[cache_key] = current_time
                    return loss_result

                # Evaluate latency thresholds
                # During startup grace period, downgrade CRITICAL/WARNING to INFO
                # to avoid false alerts from transient boot-time latency spikes
                in_grace_period = _is_startup_health_grace()

                if avg_latency > self.NETWORK_LATENCY_CRITICAL:
                    if in_grace_period:
                        status = 'INFO'
                        reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
                    else:
                        status = 'CRITICAL'
                        reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
                elif avg_latency > self.NETWORK_LATENCY_WARNING:
                    if in_grace_period:
                        status = 'INFO'
                        reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
                    else:
                        status = 'WARNING'
                        reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
                else:
                    status = 'OK'
                    reason = None

                latency_result = {
                    'status': status,
                    'latency_ms': round(avg_latency, 1),
                    'latency_min': round(min_latency, 1) if min_latency else None,
                    'latency_max': round(max_latency, 1) if max_latency else None,
                    'packet_loss': packet_loss,
                }
                if reason:
                    latency_result['reason'] = reason

                self.cached_results[cache_key] = latency_result
                self.last_check_times[cache_key] = current_time
                return latency_result

            # No data in database yet
            no_data_result = {
                'status': 'UNKNOWN',
                'reason': 'No gateway latency data available yet'
            }
            self.cached_results[cache_key] = no_data_result
            self.last_check_times[cache_key] = current_time
            return no_data_result

        except Exception as e:
            return {'status': 'UNKNOWN', 'reason': f'Latency check failed: {str(e)}', 'dismissable': True}

    def _is_vzdump_active(self) -> bool:
        """Check if a vzdump (backup) job is currently running."""
        try:
            with open('/var/log/pve/tasks/active', 'r') as f:
                for line in f:
                    if ':vzdump:' in line:
                        return True
        except (OSError, IOError):
            pass
        return False

    def _resolve_vm_name(self, vmid: str) -> str:
        """Resolve VMID to guest name from PVE config files."""
        if not vmid:
            return ''
        for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']:
            conf = os.path.join(base, f'{vmid}.conf')
            try:
                with open(conf) as f:
                    for line in f:
                        if line.startswith('hostname:') or line.startswith('name:'):
                            return line.split(':', 1)[1].strip()
            except (OSError, IOError):
                continue
        return ''

    def _vm_ct_exists(self, vmid: str) -> bool:
        """Check if a VM or CT exists by verifying its config file."""
        import os
        # Check VM config
        vm_conf = f'/etc/pve/qemu-server/{vmid}.conf'
        if os.path.exists(vm_conf):
            return True
        # Check CT config (local node and cluster nodes)
        for base in ['/etc/pve/lxc', '/etc/pve/nodes']:
            if base == '/etc/pve/lxc':
                ct_conf = f'{base}/{vmid}.conf'
                if os.path.exists(ct_conf):
                    return True
            else:
                # Check all cluster nodes
                if os.path.isdir(base):
                    for node in os.listdir(base):
                        ct_conf = f'{base}/{node}/lxc/{vmid}.conf'
                        if os.path.exists(ct_conf):
                            return True
        return False

    def _is_vm_running(self, vmid: str) -> bool:
        """Check if a VM or CT is currently running."""
        import subprocess
        try:
            # Check VM status
            result = subprocess.run(
                ['qm', 'status', vmid],
                capture_output=True, text=True, timeout=2
            )
            if result.returncode == 0 and 'running' in result.stdout.lower():
                return True
            # Check CT status
            result = subprocess.run(
                ['pct', 'status', vmid],
                capture_output=True, text=True, timeout=2
            )
            if result.returncode == 0 and 'running' in result.stdout.lower():
                return True
        except Exception:
            pass
        return False

    def _check_vms_cts_optimized(self) -> Dict[str, Any]:
        """
        Optimized VM/CT check - detects qmp failures and startup errors from logs.
        Improved detection of container and VM errors from journalctl.

        Each journal line from ``_get_journalctl_10min_warnings`` is tested
        against four patterns, in this order (the first three end with
        ``continue``, so a line is classified at most once):

        1. VM QMP command failure/timeout -> WARNING (ignored while a vzdump
           job is active, when the VM config is gone, or when the VM is
           currently running).
        2. ``ct``/``container``/``lxc`` + ID with an error/fail/device
           keyword -> WARNING for device-related reasons, CRITICAL otherwise.
        3. ``vzstart:<id>:`` task line with an error keyword -> WARNING.
        4. Generic start-failure keywords with a 3-4 digit ID -> CRITICAL.

        Only the first finding per key (``vm_<id>``, ``ct_<id>``,
        ``vmct_<id>``) is kept.

        Returns:
            ``{'status': 'OK'}`` when nothing was found; otherwise a dict
            with ``status`` (CRITICAL if any finding is CRITICAL, else
            WARNING), ``reason`` (first three issues joined by ``'; '``) and
            ``details``.  Any unexpected exception yields an UNKNOWN result.
        """
        try:
            # First: auto-resolve any persisted VM/CT errors where the guest
            # is now running.  This clears stale "Failed to start" / QMP
            # errors that are no longer relevant.
            # NOTE(review): assumes `details` is already a dict here; a
            # non-dict value would raise and be swallowed by the except below.
            try:
                active_vm_errors = health_persistence.get_active_errors('vms')
                for err in active_vm_errors:
                    details = err.get('details') or {}
                    vmid = details.get('id', '')
                    if vmid:
                        health_persistence.check_vm_running(vmid)
            except Exception:
                pass

            issues = []      # Short one-line summaries used for the overall reason
            vm_details = {}  # Finding per key; first occurrence wins

            # Use shared journalctl cache to avoid duplicate calls
            journalctl_output = self._get_journalctl_10min_warnings()

            # Check if vzdump is running -- QMP timeouts during backup are normal
            _vzdump_running = self._is_vzdump_active()

            if journalctl_output:
                for line in journalctl_output.split('\n'):
                    line_lower = line.lower()

                    # ── Pattern 1: VM QMP command failure / timeout ──
                    vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
                    if vm_qmp_match:
                        if _vzdump_running:
                            continue  # Normal during backup
                        vmid = vm_qmp_match.group(1)
                        # Skip if VM no longer exists (stale journal entry)
                        if not self._vm_ct_exists(vmid):
                            continue
                        # Skip if VM is now running - the QMP error is stale/resolved
                        # This prevents re-detecting old journal entries after VM recovery
                        if self._is_vm_running(vmid):
                            # Auto-resolve any existing error for this VM
                            health_persistence.check_vm_running(vmid)
                            continue
                        vm_name = self._resolve_vm_name(vmid)
                        display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}"
                        key = f'vm_{vmid}'
                        if key not in vm_details:
                            issues.append(f'{display}: QMP communication issue')
                            vm_details[key] = {
                                'status': 'WARNING',
                                # Journal line is truncated to 200 characters
                                'reason': f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}',
                                'id': vmid,
                                'vmname': vm_name,
                                'type': 'VM'
                            }
                        continue

                    # ── Pattern 2: container line carrying an error keyword ──
                    ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
                    if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
                        ctid = ct_error_match.group(1)
                        # Skip if CT no longer exists (stale journal entry)
                        if not self._vm_ct_exists(ctid):
                            continue
                        key = f'ct_{ctid}'
                        if key not in vm_details:
                            if 'device' in line_lower and 'does not exist' in line_lower:
                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
                                if device_match:
                                    reason = f'Device {device_match.group(1)} missing'
                                else:
                                    reason = 'Device error'
                            elif 'failed to start' in line_lower:
                                reason = 'Failed to start'
                            else:
                                reason = 'Container error'

                            ct_name = self._resolve_vm_name(ctid)
                            display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}"
                            full_reason = f'{display}: {reason}\n{line.strip()[:200]}'
                            issues.append(f'{display}: {reason}')
                            vm_details[key] = {
                                # Device-related reasons are WARNING; everything else is CRITICAL
                                'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL',
                                'reason': full_reason,
                                'id': ctid,
                                'vmname': ct_name,
                                'type': 'CT'
                            }
                        continue

                    # ── Pattern 3: vzstart task line (matched on the original-case line) ──
                    vzstart_match = re.search(r'vzstart:(\d+):', line)
                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
                        ctid = vzstart_match.group(1)
                        # Skip if CT no longer exists (stale journal entry)
                        if not self._vm_ct_exists(ctid):
                            continue
                        key = f'ct_{ctid}'
                        if key not in vm_details:
                            # Resolve CT name for better context
                            ct_name = self._resolve_vm_name(ctid)
                            ct_display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}"

                            # Extract specific error reason
                            if 'device' in line_lower and 'does not exist' in line_lower:
                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
                                if device_match:
                                    error_detail = f'Device {device_match.group(1)} missing'
                                else:
                                    error_detail = 'Device error'
                            else:
                                error_detail = 'Startup error'

                            # Include CT ID in reason for clarity in notifications
                            reason = f'{ct_display}: {error_detail}'
                            issues.append(reason)
                            vm_details[key] = {
                                'status': 'WARNING',
                                'reason': reason,
                                'id': ctid,
                                'vmname': ct_name,
                                'type': 'CT'
                            }
                        continue

                    # ── Pattern 4: generic start failure; guest type is unknown ──
                    if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
                        # First standalone 3-4 digit number on the line is taken as the guest ID
                        id_match = re.search(r'\b(\d{3,4})\b', line)
                        if id_match:
                            vmid = id_match.group(1)
                            # Skip if VM/CT no longer exists (stale journal entry)
                            if not self._vm_ct_exists(vmid):
                                continue
                            key = f'vmct_{vmid}'
                            if key not in vm_details:
                                vm_name = self._resolve_vm_name(vmid)
                                display = f"VM/CT {vmid} ({vm_name})" if vm_name else f"VM/CT {vmid}"
                                full_reason = f'{display}: Failed to start\n{line.strip()[:200]}'
                                issues.append(f'{display}: Failed to start')
                                vm_details[key] = {
                                    'status': 'CRITICAL',
                                    'reason': full_reason,
                                    'id': vmid,
                                    'vmname': vm_name,
                                    'type': 'VM/CT'
                                }

            if not issues:
                return {'status': 'OK'}

            # Overall severity is the worst severity among the findings
            has_critical = any(d.get('status') == 'CRITICAL' for d in vm_details.values())

            return {
                'status': 'CRITICAL' if has_critical else 'WARNING',
                'reason': '; '.join(issues[:3]),
                'details': vm_details
            }

        except Exception as e:
            print(f"[HealthMonitor] VMs/CTs check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    # Modified to use persistence
    def _check_vms_cts_with_persistence(self) -> Dict[str, Any]:
        """
        Check VMs/CTs with persistent error tracking.

        Three sources are merged into one result:
          1. Active errors stored by health_persistence under category 'vms'.
             An entry keyed 'vm_<id>', 'ct_<id>' or 'vmct_<id>' is dropped
             when health_persistence.check_vm_running() reports the guest
             as running.
          2. Dismissed 'vms' errors, which are surfaced with status INFO.
          3. New errors parsed from the shared 10-minute journalctl output:
             QMP command failures, vzstart container errors and generic
             "failed to start" lines.

        Every journal branch ignores IDs for which self._vm_ct_exists() is
        false, so stale entries for deleted guests are not recorded. A line
        handled by the QMP or vzstart branch is never re-examined by the
        generic branch.

        NOTE(review): the original docstring states that errors persist
        "until VM starts or 48h elapsed"; the 48h expiry is not implemented
        in this method -- presumably it lives in health_persistence (confirm).

        Returns:
            Dict with 'status' ('OK', 'WARNING', 'CRITICAL' or 'UNKNOWN') and
            'checks'. Non-OK results also carry 'reason'; WARNING/CRITICAL
            results additionally carry 'details'.
        """
        try:
            issues = []
            vm_details = {}

            # Get active (non-dismissed) errors
            persistent_errors = health_persistence.get_active_errors('vms')

            # Also get dismissed errors to show them as INFO
            dismissed_errors = health_persistence.get_dismissed_errors()
            dismissed_vm_errors = [e for e in dismissed_errors if e.get('category') == 'vms']

            # Process active errors
            for error in persistent_errors:
                error_key = error['error_key']

                if error_key.startswith(('vm_', 'ct_', 'vmct_')):
                    vm_id = error_key.split('_', 1)[1]
                    # Check if VM is running using persistence helper
                    if health_persistence.check_vm_running(vm_id):
                        continue  # Error auto-resolved if VM is now running

                # A missing or non-dict 'details' must not abort the whole check
                details = error.get('details')
                if not isinstance(details, dict):
                    details = {}

                # Still active, add to details
                vm_details[error_key] = {
                    'status': error['severity'],
                    'reason': error['reason'],
                    'id': details.get('id', 'unknown'),
                    'type': details.get('type', 'VM/CT'),
                    'first_seen': error['first_seen'],
                    'dismissed': False,
                }
                issues.append(f"{details.get('type', 'VM')} {details.get('id', '')}: {error['reason']}")

            # Process dismissed errors (show as INFO)
            for error in dismissed_vm_errors:
                error_key = error['error_key']
                if error_key not in vm_details:  # Don't overwrite active errors
                    details = error.get('details')
                    if not isinstance(details, dict):
                        details = {}
                    vm_details[error_key] = {
                        'status': 'INFO',
                        'reason': error['reason'],
                        'id': details.get('id', 'unknown'),
                        'type': details.get('type', 'VM/CT'),
                        'first_seen': error['first_seen'],
                        'dismissed': True,
                    }

            # Check for new errors in logs
            # Using shared journalctl cache to avoid duplicate calls
            journalctl_output = self._get_journalctl_10min_warnings()

            _vzdump_running = self._is_vzdump_active()

            if journalctl_output:
                for line in journalctl_output.split('\n'):
                    line_lower = line.lower()

                    # VM QMP errors (skip during active backup -- normal behavior)
                    vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower)
                    if vm_qmp_match:
                        if _vzdump_running:
                            continue  # Normal during backup
                        vmid = vm_qmp_match.group(1)

                        # Skip if VM no longer exists (deleted after error occurred)
                        if not self._vm_ct_exists(vmid):
                            continue

                        # Skip if VM is now running - the QMP error is stale/resolved
                        # This prevents re-detecting old journal entries after VM recovery
                        if self._is_vm_running(vmid):
                            # Auto-resolve any existing error for this VM
                            health_persistence.check_vm_running(vmid)
                            continue

                        vm_name = self._resolve_vm_name(vmid)
                        display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}"
                        error_key = f'vm_{vmid}'
                        if error_key not in vm_details:
                            rec_result = health_persistence.record_error(
                                error_key=error_key,
                                category='vms',
                                severity='WARNING',
                                reason=f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}',
                                details={'id': vmid, 'vmname': vm_name, 'type': 'VM'}
                            )
                            # A dismissed (acknowledged) error is recorded but not reported
                            if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
                                issues.append(f'{display}: QMP communication issue')
                                vm_details[error_key] = {
                                    'status': 'WARNING',
                                    'reason': f'{display}: QMP command failed or timed out',
                                    'id': vmid,
                                    'vmname': vm_name,
                                    'type': 'VM'
                                }
                        continue

                    # Container errors (including startup issues via vzstart)
                    vzstart_match = re.search(r'vzstart:(\d+):', line)
                    if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
                        ctid = vzstart_match.group(1)

                        # Skip if CT no longer exists (deleted after error occurred)
                        if not self._vm_ct_exists(ctid):
                            continue

                        error_key = f'ct_{ctid}'

                        if error_key not in vm_details:
                            # Resolve CT name for better context
                            ct_name = self._resolve_vm_name(ctid)
                            ct_display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}"

                            if 'device' in line_lower and 'does not exist' in line_lower:
                                device_match = re.search(r'device\s+([/\w\d]+)\s+does not exist', line_lower)
                                if device_match:
                                    error_detail = f'Device {device_match.group(1)} missing'
                                else:
                                    error_detail = 'Device error'
                            else:
                                error_detail = 'Startup error'

                            # Include CT ID in reason for clarity
                            reason = f'{ct_display}: {error_detail}'

                            # Record persistent error
                            rec_result = health_persistence.record_error(
                                error_key=error_key,
                                category='vms',
                                severity='WARNING',
                                reason=reason,
                                details={'id': ctid, 'vmname': ct_name, 'type': 'CT'}
                            )
                            if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
                                issues.append(reason)
                                vm_details[error_key] = {
                                    'status': 'WARNING',
                                    'reason': reason,
                                    'id': ctid,
                                    'vmname': ct_name,
                                    'type': 'CT'
                                }
                        # This line is fully handled here; letting it fall through
                        # to the generic matcher could record a second error
                        # keyed on an unrelated number from the same line.
                        continue

                    # Generic failed to start for VMs and CTs
                    if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']):
                        # Try contextual VMID patterns first (more precise), then fallback to generic
                        id_match = (
                            re.search(r'(?:VMID|vmid|VM|CT|qemu|lxc|pct|qm)[:\s=/]+(\d{3,5})\b', line) or
                            re.search(r'\b(\d{3,5})\.conf\b', line) or
                            re.search(r'\b(\d{3,5})\b', line)
                        )
                        if not id_match:
                            continue

                        vmid_ctid = id_match.group(1)

                        # Skip if VM/CT does not exist: either a stale journal entry
                        # or the generic fallback matched a non-ID number (e.g. a PID)
                        if not self._vm_ct_exists(vmid_ctid):
                            continue

                        # Determine if it's a VM or CT based on context, if possible
                        if 'vm' in line_lower or 'qemu' in line_lower:
                            error_key = f'vm_{vmid_ctid}'
                            vm_type = 'VM'
                        elif 'ct' in line_lower or 'lxc' in line_lower:
                            error_key = f'ct_{vmid_ctid}'
                            vm_type = 'CT'
                        else:
                            # Fallback if type is unclear
                            error_key = f'vmct_{vmid_ctid}'
                            vm_type = 'VM/CT'

                        if error_key not in vm_details:
                            vm_name = self._resolve_vm_name(vmid_ctid)
                            display = f"{vm_type} {vmid_ctid}"
                            if vm_name:
                                display = f"{vm_type} {vmid_ctid} ({vm_name})"
                            reason = f'{display}: Failed to start\n{line.strip()[:200]}'
                            # Record persistent error
                            rec_result = health_persistence.record_error(
                                error_key=error_key,
                                category='vms',
                                severity='CRITICAL',
                                reason=reason,
                                details={'id': vmid_ctid, 'vmname': vm_name, 'type': vm_type}
                            )
                            if not rec_result or rec_result.get('type') != 'skipped_acknowledged':
                                issues.append(f'{display}: Failed to start')
                                vm_details[error_key] = {
                                    'status': 'CRITICAL',
                                    'reason': reason,
                                    'id': vmid_ctid,
                                    'vmname': vm_name,
                                    'type': vm_type
                                }

            # Build checks dict from vm_details
            # 'key' is the persistence error_key (e.g. 'qmp_110', 'ct_101', 'vm_110')
            checks = {}
            for key, val in vm_details.items():
                vm_label = f"{val.get('type', 'VM')} {val.get('id', key)}"
                is_dismissed = val.get('dismissed', False)
                checks[vm_label] = {
                    'status': 'INFO' if is_dismissed else val.get('status', 'WARNING'),
                    'detail': val.get('reason', 'Error'),
                    'dismissable': True,
                    'dismissed': is_dismissed,
                    'error_key': key  # Must match the persistence DB key
                }

            if not issues:
                # No active (non-dismissed) issues
                if not checks:
                    checks['qmp_communication'] = {'status': 'OK', 'detail': 'No QMP timeouts detected'}
                    checks['container_startup'] = {'status': 'OK', 'detail': 'No container startup errors'}
                    checks['vm_startup'] = {'status': 'OK', 'detail': 'No VM startup failures'}
                    checks['oom_killer'] = {'status': 'OK', 'detail': 'No OOM events detected'}
                return {'status': 'OK', 'checks': checks}

            # Only consider non-dismissed items for overall severity
            active_details = {k: v for k, v in vm_details.items() if not v.get('dismissed')}
            has_critical = any(d.get('status') == 'CRITICAL' for d in active_details.values())

            return {
                'status': 'CRITICAL' if has_critical else 'WARNING',
                'reason': '; '.join(issues[:3]),
                'details': vm_details,
                'checks': checks
            }

        except Exception as e:
            # Best-effort boundary: a failing health check must not break the caller
            print(f"[HealthMonitor] VMs/CTs persistence check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'VM/CT check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    def _check_pve_services(self) -> Dict[str, Any]:
        """
        Check critical Proxmox services with persistence tracking.

        - Checks the base PVE_SERVICES list via `systemctl is-active`
        - Dynamically adds corosync if /etc/corosync/corosync.conf exists
        - Records failed services in persistence for tracking/dismiss
        - Clears the persisted error of every service that is active again

        A service whose `systemctl` call raises (e.g. timeout) is treated as
        failed with state 'error'.

        Returns:
            Dict with 'status', 'is_cluster', 'services_checked' and 'checks'.
            When at least one service failed, 'reason' and 'failed' are also
            present; if every failed service was dismissed the status is 'OK'
            with 'reason' None and an empty 'failed' list. If the check itself
            raises, only 'status' ('WARNING') and 'reason' are returned.
        """
        try:
            # Build service list: base PVE services + corosync if clustered
            services_to_check = list(self.PVE_SERVICES)
            is_cluster = os.path.exists('/etc/corosync/corosync.conf')
            if is_cluster and 'corosync' not in services_to_check:
                services_to_check.append('corosync')

            failed_services = []
            service_details = {}  # service name -> reported state for failed services

            for service in services_to_check:
                try:
                    result = subprocess.run(
                        ['systemctl', 'is-active', service],
                        capture_output=True,
                        text=True,
                        timeout=2
                    )

                    status = result.stdout.strip()
                    if result.returncode != 0 or status != 'active':
                        failed_services.append(service)
                        service_details[service] = status or 'inactive'
                except Exception:
                    failed_services.append(service)
                    service_details[service] = 'error'

            # Build checks dict with status per service
            checks = {}
            for svc in services_to_check:
                error_key = f'pve_service_{svc}'
                if svc in failed_services:
                    state = service_details.get(svc, 'inactive')
                    checks[svc] = {
                        'status': 'CRITICAL',
                        'detail': f'Service is {state}',
                        'error_key': error_key,
                        'dismissable': True,
                    }
                else:
                    checks[svc] = {
                        'status': 'OK',
                        'detail': 'Active',
                        'error_key': error_key,
                    }

            if is_cluster:
                checks['cluster_mode'] = {
                    'status': 'OK',
                    'detail': 'Cluster detected (corosync.conf present)',
                }

            # Record each failed service in persistence, respecting dismiss
            active_failed = []
            for svc in failed_services:
                state = service_details.get(svc, 'inactive')
                rec_result = health_persistence.record_error(
                    error_key=f'pve_service_{svc}',
                    category='pve_services',
                    severity='CRITICAL',
                    reason=f'PVE service {svc} is {state}',
                    details={'service': svc, 'state': state}
                )
                if rec_result and rec_result.get('type') == 'skipped_acknowledged':
                    # Mark as dismissed in checks for frontend
                    if svc in checks:
                        checks[svc]['dismissed'] = True
                else:
                    active_failed.append(svc)

            # Auto-clear services that recovered (all of them when nothing failed)
            for svc in services_to_check:
                if svc not in failed_services:
                    error_key = f'pve_service_{svc}'
                    if health_persistence.is_error_active(error_key):
                        health_persistence.clear_error(error_key)

            if not failed_services:
                return {
                    'status': 'OK',
                    'is_cluster': is_cluster,
                    'services_checked': len(services_to_check),
                    'checks': checks
                }

            # If all failed services are dismissed, return OK
            if not active_failed:
                return {
                    'status': 'OK',
                    'reason': None,
                    'failed': [],
                    'is_cluster': is_cluster,
                    'services_checked': len(services_to_check),
                    'checks': checks
                }

            return {
                'status': 'CRITICAL',
                'reason': f'Services inactive: {", ".join(active_failed)}',
                'failed': active_failed,
                'is_cluster': is_cluster,
                'services_checked': len(services_to_check),
                'checks': checks
            }

        except Exception as e:
            return {
                'status': 'WARNING',
                'reason': f'Service check command failed: {str(e)}'
            }

    def _is_benign_error(self, line: str) -> bool:
        """Check if log line matches benign error patterns (uses pre-compiled regex)"""
        benign_re, _, _ = self._get_compiled_patterns()
        return bool(benign_re.search(line.lower()))

    def _enrich_critical_log_reason(self, line: str) -> str:
        """
        Transform a raw kernel/system log line into a human-readable reason
        for notifications and the health dashboard.
        """
        line_lower = line.lower()

        # EXT4/BTRFS/XFS/ZFS filesystem errors
        if 'ext4-fs error' in line_lower or 'btrfs error' in line_lower or 'xfs' in line_lower and 'error' in line_lower:
            fs_type = 'EXT4' if 'ext4' in line_lower else ('BTRFS' if 'btrfs' in line_lower else 'XFS')
            dev_match = re.search(r'device\s+(\S+?)\)?:', line)
            device = dev_match.group(1).rstrip(')') if dev_match else 'unknown'
            func_match = re.search(r':\s+(\w+):\d+:', line)
            func_name = func_match.group(1) if func_match else ''
            inode_match = re.search(r'inode\s+#?(\d+)', line)
            inode = inode_match.group(1) if inode_match else ''

            # Translate function name
            func_translations = {
                'ext4_find_entry': 'directory lookup failed (possible directory corruption)',
                'ext4_lookup': 'file lookup failed (possible metadata corruption)',
                'ext4_journal_start': 'journal transaction failed (journal corruption)',
                'ext4_readdir': 'directory read failed (directory data corrupted)',
                'ext4_get_inode_loc': 'inode location failed (inode table corruption)',
                '__ext4_get_inode_loc': 'inode location failed (inode table corruption)',
                'ext4_xattr_get': 'extended attributes read failed',
                'ext4_iget': 'inode read failed (possible inode corruption)',
                'ext4_mb_generate_buddy': 'block allocator error',
                'ext4_validate_block_bitmap': 'block bitmap corrupted',
                'ext4_validate_inode_bitmap': 'inode bitmap corrupted',
                'htree_dirblock_to_tree': 'directory index tree corrupted',
            }

            # Identify the device
            device_info = self._identify_block_device(device)

            reason = f'{fs_type} filesystem error on /dev/{device}'
            if device_info:
                reason += f'\nDevice: {device_info}'
            else:
                reason += f'\nDevice: /dev/{device} (not currently detected -- may be a disconnected USB or temporary device)'
            if func_name:
                desc = func_translations.get(func_name, func_name)
                reason += f'\nError: {desc}'
            if inode:
                inode_hint = 'root directory' if inode == '2' else f'inode #{inode}'
                reason += f'\nAffected: {inode_hint}'
            # Note: Action/recommendations are provided by AI when AI Suggestions is enabled
            return reason

        # Out of memory
        if 'out of memory' in line_lower or 'oom_kill' in line_lower:
            m = re.search(r'Killed process\s+\d+\s+\(([^)]+)\)', line)
            process = m.group(1) if m else 'unknown'
            return f'Out of memory - system killed process "{process}" to free RAM'

        # Kernel panic
        if 'kernel panic' in line_lower:
            return 'Kernel panic - system halted. Reboot required.'

        # Segfault
        if 'segfault' in line_lower:
            m = re.search(r'(\S+)\[\d+\].*segfault', line)
            process = m.group(1) if m else 'unknown'
            is_critical_proc = any(p in process.lower() for p in self.PVE_CRITICAL_PROCESSES)
            if is_critical_proc:
                return f'Critical process "{process}" crashed (segmentation fault) -- PVE service affected'
            return f'Process "{process}" crashed (segmentation fault)'

        # Hardware error
        if 'hardware error' in line_lower or 'mce:' in line_lower:
            return f'Hardware error detected (MCE) - check CPU/RAM health'

        # RAID failure
        if 'raid' in line_lower and 'fail' in line_lower:
            md_match = re.search(r'(md\d+)', line)
            md_dev = md_match.group(1) if md_match else 'unknown'
            return f'RAID array {md_dev} degraded or failed - check disk status'

        # Fallback: clean up the raw line
        clean = re.sub(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+', '', line)
        clean = re.sub(r'\[\d+\]:\s*', '', clean)
        return clean[:150]

    def _classify_log_severity(self, line: str) -> Optional[str]:
        """
        Classify log line severity intelligently.
        Returns: 'CRITICAL', 'WARNING', or None (benign/info)

        Design principles:
        - CRITICAL must be reserved for events that require IMMEDIATE action
          (data loss risk, service outage, hardware failure confirmed by SMART).
        - WARNING is for events worth investigating but not urgent.
        - Everything else is None (benign/informational).
        """
        line_lower = line.lower()

        # Check if benign first -- fast path for known noise
        if self._is_benign_error(line):
            return None

        # Check critical keywords (hard failures: OOM, panic, FS corruption, etc.)
        for keyword in self.CRITICAL_LOG_KEYWORDS:
            if re.search(keyword, line_lower):
                return 'CRITICAL'

        # Check warning keywords (includes segfault, I/O errors, etc.)
        for keyword in self.WARNING_LOG_KEYWORDS:
            if re.search(keyword, line_lower):
                # Special case: segfault of a PVE-critical process is CRITICAL
                if 'segfault' in line_lower:
                    for proc in self.PVE_CRITICAL_PROCESSES:
                        if proc in line_lower:
                            return 'CRITICAL'
                return 'WARNING'

        # Generic classification -- very conservative to avoid false positives.
        # Only escalate if the line explicitly uses severity-level keywords
        # from the kernel or systemd (not just any line containing "error").
        if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
            return 'CRITICAL'

        # Lines from priority "err" that don't match any keyword above are
        # likely informational noise (e.g. "error response from daemon").
        # Return None to avoid flooding the dashboard with non-actionable items.
        return None

    def _check_logs_with_persistence(self) -> Dict[str, Any]:
        """
        Intelligent log checking with cascade detection and persistence.
        Focuses on detecting significant error patterns rather than transient warnings.

        New thresholds:
        - CASCADE: ≥15 errors (increased from 10)
        - SPIKE: ≥5 errors AND 4x increase (more restrictive)
        - PERSISTENT: Same error in 3 consecutive checks
        """
        cache_key = 'logs_analysis'
        current_time = time.time()

        # Cache the result for 5 minutes to avoid excessive journalctl calls
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < self.LOG_CHECK_INTERVAL:
                # Return the full cached result (which includes 'checks' dict)
                cached = self.cached_results.get(cache_key)
                if cached:
                    return cached
                return {'status': 'OK', 'checks': {
                    'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
                    'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
                    'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
                    'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
                }}

        try:
            # Fetch logs from the last 3 minutes for immediate issue detection
            # Use -b 0 to only include logs from the CURRENT boot (not previous boots)
            # This prevents OOM/crash errors from before a reboot from persisting
            result_recent = subprocess.run(
                ['journalctl', '-b', '0', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
            )

            # Fetch logs from the previous 3-minute interval to detect spikes/cascades
            # Also limited to current boot only
            result_previous = subprocess.run(
                ['journalctl', '-b', '0', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
            )

            if result_recent.returncode == 0:
                recent_lines = result_recent.stdout.strip().split('\n')
                previous_lines = result_previous.stdout.strip().split('\n') if result_previous.returncode == 0 else []

                recent_patterns = defaultdict(int)
                previous_patterns = defaultdict(int)
                critical_errors_found = {} # To store unique critical error lines for persistence

                for line in recent_lines:
                    if not line.strip():
                        continue

                    # Skip benign errors
                    if self._is_benign_error(line):
                        continue

                    # Classify severity
                    severity = self._classify_log_severity(line)

                    if severity is None: # Skip informational or classified benign lines
                        continue

                    # Normalize to a pattern for grouping
                    pattern = self._normalize_log_pattern(line)

                    if severity == 'CRITICAL':
                        pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                        error_key = f'log_critical_{pattern_hash}'

                        # ── SMART cross-reference for disk/FS errors ──
                        # Filesystem and disk errors are only truly CRITICAL if
                        # the underlying disk is actually failing.  We check:
                        #  1. Device exists? No -> WARNING (disconnected USB, etc.)
                        #  2. SMART PASSED? -> WARNING (transient error, not disk failure)
                        #  3. SMART FAILED? -> CRITICAL (confirmed hardware problem)
                        #  4. SMART UNKNOWN? -> WARNING (can't confirm, err on side of caution)
                        fs_dev_match = re.search(
                            r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?device\s+(\S+?)\)?[:\s]',
                            line, re.IGNORECASE
                        )
                        smart_status_for_log = None
                        if fs_dev_match:
                            fs_dev = fs_dev_match.group(1).rstrip(')')
                            base_dev = re.sub(r'\d+$', '', fs_dev)
                            if not os.path.exists(f'/dev/{base_dev}'):
                                # Device not present -- almost certainly a disconnected drive
                                severity = 'WARNING'
                                smart_status_for_log = 'DEVICE_ABSENT'
                            elif self.capabilities.get('has_smart'):
                                smart_health = self._quick_smart_health(base_dev)
                                smart_status_for_log = smart_health
                                if smart_health == 'PASSED':
                                    # SMART says disk is healthy -- transient FS error
                                    severity = 'WARNING'
                                elif smart_health == 'UNKNOWN':
                                    # Can't verify -- be conservative, don't alarm
                                    severity = 'WARNING'
                                # smart_health == 'FAILED' -> keep CRITICAL

                        if pattern not in critical_errors_found:
                            # Only count as "critical" if severity wasn't downgraded
                            if severity == 'CRITICAL':
                                critical_errors_found[pattern] = line
                            # Build a human-readable reason from the raw log line
                            enriched_reason = self._enrich_critical_log_reason(line)

                            # Append SMART context to the reason if we checked it
                            if smart_status_for_log == 'PASSED':
                                enriched_reason += '\nSMART: Passed (disk is healthy -- error is likely transient)'
                            elif smart_status_for_log == 'FAILED':
                                enriched_reason += '\nSMART: FAILED -- disk is failing, replace immediately'
                            elif smart_status_for_log == 'DEVICE_ABSENT':
                                enriched_reason += '\nDevice not currently detected -- may be a disconnected USB or temporary device'

                            # Record persistent error if it's not already active
                            if not health_persistence.is_error_active(error_key, category='logs'):
                                health_persistence.record_error(
                                    error_key=error_key,
                                    category='logs',
                                    severity=severity,
                                    reason=enriched_reason,
                                    details={'pattern': pattern, 'raw_line': line[:200],
                                             'smart_status': smart_status_for_log,
                                             'dismissable': True}
                                )

                            # Cross-reference: filesystem errors also belong in the disks category
                            # so they appear in the Storage/Disks dashboard section
                            fs_match = re.search(r'(?:ext4-fs|btrfs|xfs|zfs)\s+error.*?(?:device\s+(\S+?)\)?[:\s])', line, re.IGNORECASE)
                            if fs_match:
                                fs_device = fs_match.group(1).rstrip(')') if fs_match.group(1) else 'unknown'
                                # Strip partition number to get base disk (sdb1 -> sdb)
                                base_device = re.sub(r'\d+$', '', fs_device) if not ('nvme' in fs_device or 'mmcblk' in fs_device) else fs_device.rsplit('p', 1)[0] if 'p' in fs_device else fs_device
                                disk_error_key = f'disk_fs_{fs_device}'

                                # Use the SMART-aware severity we already determined above
                                device_exists = os.path.exists(f'/dev/{base_device}')
                                if not device_exists:
                                    # Device no longer exists (USB disconnected, removed disk)
                                    # Skip creating error - it's a stale journal entry
                                    continue
                                elif smart_status_for_log == 'PASSED':
                                    fs_severity = 'WARNING'  # SMART healthy -> transient
                                elif smart_status_for_log == 'FAILED':
                                    fs_severity = 'CRITICAL'  # SMART failing -> real problem
                                else:
                                    fs_severity = 'WARNING'  # Can't confirm -> conservative

                                if not health_persistence.is_error_active(disk_error_key, category='disks'):
                                    health_persistence.record_error(
                                        error_key=disk_error_key,
                                        category='disks',
                                        severity=fs_severity,
                                        reason=enriched_reason,
                                        details={
                                            'disk': base_device,
                                            'device': f'/dev/{fs_device}',
                                            'block_device': base_device,
                                            'error_type': 'filesystem',
                                            'error_count': 1,
                                            'sample': line[:200],
                                            'smart_status': smart_status_for_log,
                                            'dismissable': True,
                                            'device_exists': True,  # Always true here (non-existent devices skip above)
                                        }
                                    )

                                # Record filesystem error as permanent disk observation
                                try:
                                    obs_serial = None
                                    try:
                                        sm = subprocess.run(
                                            ['smartctl', '-i', f'/dev/{base_device}'],
                                            capture_output=True, text=True, timeout=3)
                                        if sm.returncode in (0, 4):
                                            for sline in sm.stdout.split('\n'):
                                                if 'Serial Number' in sline or 'Serial number' in sline:
                                                    obs_serial = sline.split(':')[-1].strip()
                                                    break
                                    except Exception:
                                        pass
                                    health_persistence.record_disk_observation(
                                        device_name=base_device,
                                        serial=obs_serial,
                                        error_type='filesystem_error',
                                        error_signature=f'fs_error_{fs_device}_{pattern_hash}',
                                        raw_message=enriched_reason[:500],
                                        severity=fs_severity.lower(),
                                    )
                                except Exception:
                                    pass

                    recent_patterns[pattern] += 1

                    if pattern in self.persistent_log_patterns:
                        self.persistent_log_patterns[pattern]['count'] += 1
                        self.persistent_log_patterns[pattern]['last_seen'] = current_time
                    else:
                        self.persistent_log_patterns[pattern] = {
                            'count': 1,
                            'first_seen': current_time,
                            'last_seen': current_time,
                            'sample': line.strip()[:200],  # Original line for display
                        }

                for line in previous_lines:
                    if not line.strip():
                        continue

                    # Skip benign errors
                    if self._is_benign_error(line):
                        continue

                    # Classify severity
                    severity = self._classify_log_severity(line)

                    if severity is None: # Skip informational or classified benign lines
                        continue

                    # Normalize to a pattern for grouping
                    pattern = self._normalize_log_pattern(line)
                    previous_patterns[pattern] += 1

                cascading_errors = {
                    pattern: count for pattern, count in recent_patterns.items()
                    if count >= 15 and self._classify_log_severity(pattern) in ['WARNING', 'CRITICAL']
                }

                spike_errors = {}
                for pattern, recent_count in recent_patterns.items():
                    prev_count = previous_patterns.get(pattern, 0)
                    if recent_count >= 5 and recent_count >= prev_count * 4:
                        spike_errors[pattern] = recent_count

                # Helper: get human-readable samples from normalized patterns
                def _get_samples(error_dict, max_items=3):
                    """Return list of readable sample lines for error patterns."""
                    samples = []
                    for pattern in list(error_dict.keys())[:max_items]:
                        pdata = self.persistent_log_patterns.get(pattern, {})
                        sample = pdata.get('sample', pattern)
                        # Trim timestamp prefix if present (e.g. "Feb 27 16:03:35 host ")
                        clean = re.sub(r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample)
                        samples.append(clean[:120])
                    return samples

                persistent_errors = {}
                for pattern, data in self.persistent_log_patterns.items():
                    time_span = current_time - data['first_seen']
                    if data['count'] >= 3 and time_span >= 900:  # 15 minutes
                        persistent_errors[pattern] = data['count']

                        # Record as warning if not already recorded
                        pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
                        error_key = f'log_persistent_{pattern_hash}'
                        if not health_persistence.is_error_active(error_key, category='logs'):
                            # Use the original sample line for the notification,
                            # not the normalized pattern (which has IDs replaced).
                            sample = data.get('sample', pattern)
                            # Strip journal timestamp prefix so the stored reason
                            # doesn't contain dated information that confuses
                            # re-notifications.
                            clean_sample = re.sub(
                                r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample
                            )
                            health_persistence.record_error(
                                error_key=error_key,
                                category='logs',
                                severity='WARNING',
                                reason=f'Recurring error ({data["count"]}x): {clean_sample[:150]}',
                                details={'pattern': pattern, 'sample': sample,
                                         'dismissable': True, 'occurrences': data['count']}
                            )

                patterns_to_remove = [
                    p for p, data in self.persistent_log_patterns.items()
                    if current_time - data['last_seen'] > 1800
                ]
                for pattern in patterns_to_remove:
                    del self.persistent_log_patterns[pattern]

                # B5 fix: Cap size to prevent unbounded memory growth under high error load
                MAX_LOG_PATTERNS = 500
                if len(self.persistent_log_patterns) > MAX_LOG_PATTERNS:
                    sorted_patterns = sorted(
                        self.persistent_log_patterns.items(),
                        key=lambda x: x[1]['last_seen'],
                        reverse=True
                    )
                    self.persistent_log_patterns = defaultdict(
                        lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0},
                        dict(sorted_patterns[:MAX_LOG_PATTERNS])
                    )

                unique_critical_count = len(critical_errors_found)
                cascade_count = len(cascading_errors)
                spike_count = len(spike_errors)
                persistent_count = len(persistent_errors)

                if unique_critical_count > 0:
                    status = 'CRITICAL'
                    # Use enriched reason from the first critical error for the summary
                    representative_line = next(iter(critical_errors_found.values()))
                    enriched = self._enrich_critical_log_reason(representative_line)
                    if unique_critical_count == 1:
                        reason = enriched
                    else:
                        reason = f'{unique_critical_count} critical error(s):\n{enriched}'
                elif cascade_count > 0:
                    status = 'WARNING'
                    samples = _get_samples(cascading_errors, 3)
                    reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f'  - {s}' for s in samples)
                elif spike_count > 0:
                    status = 'WARNING'
                    samples = _get_samples(spike_errors, 3)
                    reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f'  - {s}' for s in samples)
                elif persistent_count > 0:
                    status = 'WARNING'
                    samples = _get_samples(persistent_errors, 3)
                    reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f'  - {s}' for s in samples)
                else:
                    # No significant issues found
                    status = 'OK'
                    reason = None

                # Record/clear persistent errors for each log sub-check so Dismiss works
                cascade_samples = _get_samples(cascading_errors, 2) if cascade_count else []
                spike_samples = _get_samples(spike_errors, 2) if spike_count else []
                persist_samples = _get_samples(persistent_errors, 2) if persistent_count else []

                log_sub_checks = {
                    'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING',
                        'reason': f'{cascade_count} pattern(s) repeating >=15 times:\n' + '\n'.join(f'  - {s}' for s in cascade_samples) if cascade_count else ''},
                    'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING',
                        'reason': f'{spike_count} pattern(s) with 4x increase:\n' + '\n'.join(f'  - {s}' for s in spike_samples) if spike_count else ''},
                    'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING',
                        'reason': f'{persistent_count} recurring pattern(s) over 15+ min:\n' + '\n'.join(f'  - {s}' for s in persist_samples) if persistent_count else ''},
                    'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL',
                        'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False},
                }

                # Track which sub-checks were dismissed
                dismissed_keys = set()
                for err_key, info in log_sub_checks.items():
                    if info['active']:
                        is_dismissable = info.get('dismissable', True)
                        result = health_persistence.record_error(
                            error_key=err_key,
                            category='logs',
                            severity=info['severity'],
                            reason=info['reason'],
                            details={'dismissable': is_dismissable}
                        )
                        if result and result.get('type') == 'skipped_acknowledged':
                            dismissed_keys.add(err_key)
                    elif health_persistence.is_error_active(err_key):
                        health_persistence.clear_error(err_key)

                # Build checks dict - downgrade dismissed items to INFO
                def _log_check_status(key, active, severity):
                    if not active:
                        return 'OK'
                    if key in dismissed_keys:
                        return 'INFO'
                    return severity

                # Build detail strings that include the actual error samples
                # so the user can see exactly WHAT is triggering the warning.
                if cascade_count > 0:
                    cascade_detail = f'{cascade_count} pattern(s) repeating >=15 times: ' + '; '.join(cascade_samples)
                else:
                    cascade_detail = 'No cascading errors'

                if spike_count > 0:
                    spike_detail = f'{spike_count} pattern(s) with 4x increase: ' + '; '.join(spike_samples)
                else:
                    spike_detail = 'No error spikes'

                if persistent_count > 0:
                    persist_detail = f'{persistent_count} recurring pattern(s) over 15+ min: ' + '; '.join(persist_samples)
                else:
                    persist_detail = 'No persistent patterns'

                log_checks = {
                    'log_error_cascade': {
                        'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
                        'detail': cascade_detail,
                        'dismissable': True,
                        'dismissed': 'log_error_cascade' in dismissed_keys,
                        'error_key': 'log_error_cascade'
                    },
                    'log_error_spike': {
                        'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
                        'detail': spike_detail,
                        'dismissable': True,
                        'dismissed': 'log_error_spike' in dismissed_keys,
                        'error_key': 'log_error_spike'
                    },
                    'log_persistent_errors': {
                        'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
                        'detail': persist_detail,
                        'dismissable': True,
                        'dismissed': 'log_persistent_errors' in dismissed_keys,
                        'error_key': 'log_persistent_errors'
                    },
                    'log_critical_errors': {
                        'status': _log_check_status('log_critical_errors', unique_critical_count > 0, 'CRITICAL'),
                        'detail': reason if unique_critical_count > 0 else 'No critical errors',
                        'dismissable': False,
                        'error_key': 'log_critical_errors'
                    }
                }

                # Recalculate overall status considering dismissed items
                active_issues = {k: v for k, v in log_checks.items() if v['status'] in ('WARNING', 'CRITICAL')}
                if not active_issues:
                    status = 'OK'
                    reason = None
                else:
                    # Recalculate status and reason from only non-dismissed sub-checks
                    has_critical = any(v['status'] == 'CRITICAL' for v in active_issues.values())
                    status = 'CRITICAL' if has_critical else 'WARNING'
                    # Rebuild reason from active (non-dismissed) checks only
                    active_reasons = []
                    for k, v in active_issues.items():
                        detail = v.get('detail', '')
                        if detail:
                            active_reasons.append(detail)
                    reason = '; '.join(active_reasons[:3]) if active_reasons else None

                log_result = {'status': status, 'checks': log_checks}
                if reason:
                    log_result['reason'] = reason

                self.cached_results[cache_key] = log_result
                self.last_check_times[cache_key] = current_time
                return log_result

            # If journalctl command failed or returned no data
            ok_result = {'status': 'OK', 'checks': {
                'log_error_cascade': {'status': 'OK', 'detail': 'No cascading errors'},
                'log_error_spike': {'status': 'OK', 'detail': 'No error spikes'},
                'log_persistent_errors': {'status': 'OK', 'detail': 'No persistent patterns'},
                'log_critical_errors': {'status': 'OK', 'detail': 'No critical errors'}
            }}
            self.cached_results[cache_key] = ok_result
            self.last_check_times[cache_key] = current_time
            return ok_result

        except Exception as e:
            print(f"[HealthMonitor] Log check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'Log check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    def _normalize_log_pattern(self, line: str) -> str:
        """
        Normalize log line to a pattern for grouping similar errors.
        Removes timestamps, PIDs, IDs, paths, and other variables.
        """
        # Remove standard syslog timestamp and process info if present
        pattern = re.sub(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+(\s+\[\d+\])?:\s+', '', line)

        pattern = re.sub(r'\d{4}-\d{2}-\d{2}', '', pattern)  # Remove dates
        pattern = re.sub(r'\d{2}:\d{2}:\d{2}', '', pattern)  # Remove times
        pattern = re.sub(r'pid[:\s]+\d+', 'pid:XXX', pattern.lower())  # Normalize PIDs
        pattern = re.sub(r'\b\d{3,6}\b', 'ID', pattern)  # Normalize IDs (common for container/VM IDs)
        pattern = re.sub(r'/dev/\S+', '/dev/XXX', pattern)  # Normalize device paths
        pattern = re.sub(r'/\S+/\S+', '/PATH/', pattern)  # Normalize general paths
        pattern = re.sub(r'0x[0-9a-f]+', '0xXXX', pattern)  # Normalize hex values
        pattern = re.sub(r'\b(uuid|guid|hash)[:=]\s*[\w-]+\b', r'\1=XXX', pattern.lower()) # Normalize UUIDs/GUIDs
        pattern = re.sub(r'\s+', ' ', pattern).strip()  # Normalize whitespace

        return pattern[:150]  # Keep first 150 characters to avoid overly long patterns

    # Regexes for the "Inst" lines read from `apt-get upgrade --dry-run` output
    # in _check_updates.
    # _RE_INST matches upgrades of an installed package:
    #     Inst <pkg> [<cur>] (<new> <repo> [<arch>])
    # and captures (package, current version, new version).
    # _RE_INST_NEW is the fallback for lines without a bracketed current
    # version: Inst <pkg> (<new> ...   -> captures (package, new version).
    _RE_INST = re.compile(r'^Inst\s+(\S+)\s+\[([^\]]+)\]\s+\((\S+)\s+')
    _RE_INST_NEW = re.compile(r'^Inst\s+(\S+)\s+\((\S+)\s+')

    # Package-name prefixes (compared against the lowercased name with
    # str.startswith) that _check_updates counts as Proxmox packages.
    _PVE_PREFIXES = (
        'pve-', 'proxmox-', 'qemu-server', 'lxc-pve', 'ceph',
        'corosync', 'libpve', 'pbs-', 'pmg-',
    )
    # Prefixes counted as kernel packages. _check_updates tests these before
    # _PVE_PREFIXES, so 'pve-kernel*' / 'pve-firmware*' land in the kernel
    # bucket even though they also start with 'pve-'.
    _KERNEL_PREFIXES = ('linux-image', 'pve-kernel', 'pve-firmware')
    # Exact package names whose current -> new version pair is collected into
    # the 'important_packages' list of the updates result.
    _IMPORTANT_PKGS = {
        'pve-manager', 'proxmox-ve', 'qemu-server', 'pve-container',
        'pve-ha-manager', 'pve-firewall', 'ceph-common',
        'proxmox-backup-client',
    }

    def _check_updates(self) -> Optional[Dict[str, Any]]:
        """
        Check for pending system updates.
        - INFO: Any updates available (including security updates).
        - WARNING: Security updates pending 360+ days unpatched, or system not updated >1 year (365 days).
        - CRITICAL: System not updated >18 months (548 days).

        Updates are always informational unless they represent a prolonged
        unpatched state.  Detects PVE version upgrades from pve-manager
        Inst lines and exposes them as an INFO sub-check.
        """
        cache_key = 'updates_check'
        current_time = time.time()

        # Cache for 10 minutes
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 600:
                return self.cached_results.get(cache_key)

        try:
            apt_history_path = '/var/log/apt/history.log'
            last_update_days = None
            sec_result = None
            age_result = None

            if os.path.exists(apt_history_path):
                try:
                    mtime = os.path.getmtime(apt_history_path)
                    days_since_update = (current_time - mtime) / 86400
                    last_update_days = int(days_since_update)
                except Exception:
                    pass

            # Perform a dry run of apt-get upgrade to see pending packages
            try:
                result = subprocess.run(
                    ['apt-get', 'upgrade', '--dry-run'],
                    capture_output=True, text=True, timeout=30
                )
            except subprocess.TimeoutExpired:
                print("[HealthMonitor] apt-get upgrade --dry-run timed out")
                return {
                    'status': 'UNKNOWN',
                    'reason': 'apt-get timed out - repository may be unreachable',
                    'count': 0, 'checks': {}
                }

            status = 'OK'
            reason = None
            update_count = 0
            security_pkgs: list = []
            kernel_pkgs: list = []
            pve_pkgs: list = []
            important_pkgs: list = []   # {name, cur, new}
            pve_manager_info = None     # {cur, new} or None
            sec_result = None
            sec_severity = 'INFO'
            sec_days_unpatched = 0

            if result.returncode == 0:
                for line in result.stdout.strip().split('\n'):
                    if not line.startswith('Inst '):
                        continue
                    update_count += 1

                    # Parse package name, current and new versions
                    m = self._RE_INST.match(line)
                    if m:
                        pkg_name, cur_ver, new_ver = m.group(1), m.group(2), m.group(3)
                    else:
                        m2 = self._RE_INST_NEW.match(line)
                        if m2:
                            pkg_name, cur_ver, new_ver = m2.group(1), '', m2.group(2)
                        else:
                            parts = line.split()
                            pkg_name = parts[1] if len(parts) > 1 else 'unknown'
                            cur_ver, new_ver = '', ''

                    # Strip arch suffix (e.g. package:amd64)
                    pkg_name = pkg_name.split(':')[0]
                    name_lower = pkg_name.lower()
                    line_lower = line.lower()

                    # Categorise
                    if 'security' in line_lower or 'debian-security' in line_lower:
                        security_pkgs.append(pkg_name)

                    if any(name_lower.startswith(p) for p in self._KERNEL_PREFIXES):
                        kernel_pkgs.append(pkg_name)
                    elif any(name_lower.startswith(p) for p in self._PVE_PREFIXES):
                        pve_pkgs.append(pkg_name)

                    # Collect important packages with version info
                    if pkg_name in self._IMPORTANT_PKGS and cur_ver:
                        important_pkgs.append({
                            'name': pkg_name, 'cur': cur_ver, 'new': new_ver
                        })

                    # Detect pve-manager upgrade -> PVE version upgrade
                    if pkg_name == 'pve-manager' and cur_ver and new_ver:
                        pve_manager_info = {'cur': cur_ver, 'new': new_ver}

                # ── Determine overall status ──────────────────────
                if security_pkgs:
                    sec_days_unpatched = 0
                    try:
                        existing = health_persistence.get_error_by_key('security_updates')
                        if existing and existing.get('first_seen'):
                            from datetime import datetime
                            first_dt = datetime.fromisoformat(existing['first_seen'])
                            sec_days_unpatched = (datetime.now() - first_dt).days
                    except Exception:
                        pass

                    if sec_days_unpatched >= self.SECURITY_WARN_DAYS:
                        status = 'WARNING'
                        reason = f'{len(security_pkgs)} security update(s) pending for {sec_days_unpatched} days'
                        sec_severity = 'WARNING'
                    else:
                        status = 'INFO'
                        reason = f'{len(security_pkgs)} security update(s) pending'
                        sec_severity = 'INFO'

                    sec_result = health_persistence.record_error(
                        error_key='security_updates',
                        category='updates',
                        severity=sec_severity,
                        reason=reason,
                        details={'count': len(security_pkgs), 'packages': security_pkgs[:5],
                                 'dismissable': sec_severity == 'WARNING',
                                 'days_unpatched': sec_days_unpatched}
                    )
                    if sec_result and sec_result.get('type') == 'skipped_acknowledged':
                        status = 'INFO'
                        reason = None

                elif last_update_days and last_update_days >= 548:
                    status = 'CRITICAL'
                    reason = f'System not updated in {last_update_days} days (>18 months)'
                    health_persistence.record_error(
                        error_key='system_age', category='updates',
                        severity='CRITICAL', reason=reason,
                        details={'days': last_update_days, 'update_count': update_count, 'dismissable': False}
                    )
                elif last_update_days and last_update_days >= 365:
                    status = 'WARNING'
                    reason = f'System not updated in {last_update_days} days (>1 year)'
                    age_result = health_persistence.record_error(
                        error_key='system_age', category='updates',
                        severity='WARNING', reason=reason,
                        details={'days': last_update_days, 'update_count': update_count, 'dismissable': True}
                    )
                    if age_result and age_result.get('type') == 'skipped_acknowledged':
                        status = 'INFO'
                        reason = None
                elif kernel_pkgs or pve_pkgs:
                    status = 'INFO'
                    reason = f'{len(kernel_pkgs)} kernel + {len(pve_pkgs)} Proxmox update(s) available'
                elif update_count > 0:
                    status = 'INFO'
                    reason = f'{update_count} package update(s) pending'

            elif result.returncode != 0:
                status = 'WARNING'
                reason = 'Failed to check for updates (apt-get error)'

            # ── Build checks dict ─────────────────────────────────
            age_dismissed = bool(age_result and age_result.get('type') == 'skipped_acknowledged')
            update_age_status = 'CRITICAL' if (last_update_days and last_update_days >= 548) else (
                'INFO' if age_dismissed else ('WARNING' if (last_update_days and last_update_days >= 365) else 'OK'))

            sec_dismissed = security_pkgs and sec_result and sec_result.get('type') == 'skipped_acknowledged'
            if sec_dismissed:
                sec_status = 'INFO'
            elif security_pkgs:
                sec_status = sec_severity
            else:
                sec_status = 'OK'

            sec_detail = f'{len(security_pkgs)} security update(s) pending'
            if security_pkgs and sec_days_unpatched >= self.SECURITY_WARN_DAYS:
                sec_detail += f' ({sec_days_unpatched} days unpatched)'

            checks = {
                'kernel_pve': {
                    'status': 'INFO' if kernel_pkgs else 'OK',
                    'detail': f'{len(kernel_pkgs)} kernel/PVE update(s)' if kernel_pkgs else 'Kernel/PVE up to date',
                    'error_key': 'kernel_pve'
                },
                'pending_updates': {
                    'status': 'INFO' if update_count > 0 else 'OK',
                    'detail': f'{update_count} package(s) pending',
                    'error_key': 'pending_updates'
                },
                'security_updates': {
                    'status': sec_status,
                    'detail': sec_detail if security_pkgs else 'No security updates pending',
                    'dismissable': sec_status == 'WARNING' and not sec_dismissed,
                    'dismissed': bool(sec_dismissed),
                    'error_key': 'security_updates'
                },
                'system_age': {
                    'status': update_age_status,
                    'detail': f'Last updated {last_update_days} day(s) ago' if last_update_days is not None else 'Unknown',
                    'dismissable': update_age_status == 'WARNING' and not age_dismissed,
                    'dismissed': bool(age_dismissed),
                    'error_key': 'system_age'
                },
            }

            # PVE version sub-check (always INFO)
            if pve_manager_info:
                checks['pve_version'] = {
                    'status': 'INFO',
                    'detail': f"PVE {pve_manager_info['cur']} -> {pve_manager_info['new']} available",
                    'error_key': 'pve_version'
                }
            else:
                checks['pve_version'] = {
                    'status': 'OK',
                    'detail': 'Proxmox VE is up to date',
                    'error_key': 'pve_version'
                }

            # Construct result dictionary
            update_result = {
                'status': status,
                'count': update_count,
                'checks': checks,
            }
            if reason:
                update_result['reason'] = reason
            if last_update_days is not None:
                update_result['days_since_update'] = last_update_days
            # Attach categorised counts for the frontend
            update_result['security_count'] = len(security_pkgs)
            update_result['pve_count'] = len(pve_pkgs)
            update_result['kernel_count'] = len(kernel_pkgs)
            update_result['important_packages'] = important_pkgs[:8]

            self.cached_results[cache_key] = update_result
            self.last_check_times[cache_key] = current_time
            return update_result

        except Exception as e:
            print(f"[HealthMonitor] Updates check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'Updates check unavailable: {str(e)}', 'count': 0, 'checks': {}, 'dismissable': True}

    def _check_fail2ban_bans(self) -> Dict[str, Any]:
        """
        Check if fail2ban is installed and if there are currently banned IPs.
        Cached for 60 seconds to avoid hammering fail2ban-client.

        Returns:
          {'installed': bool, 'active': bool, 'status': str, 'detail': str,
           'banned_count': int, 'jails': [...], 'banned_ips': [...]}
        """
        cache_key = 'fail2ban_bans'
        current_time = time.time()

        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 60:
                return self.cached_results.get(cache_key, {'installed': False, 'status': 'OK', 'detail': 'Not installed'})

        result = {'installed': False, 'active': False, 'status': 'OK', 'detail': 'Not installed', 'banned_count': 0, 'jails': [], 'banned_ips': []}

        try:
            # Check if fail2ban-client exists
            which_result = subprocess.run(
                ['which', 'fail2ban-client'],
                capture_output=True, text=True, timeout=2
            )
            if which_result.returncode != 0:
                self.cached_results[cache_key] = result
                self.last_check_times[cache_key] = current_time
                return result

            result['installed'] = True

            # Check if fail2ban service is active
            active_check = subprocess.run(
                ['systemctl', 'is-active', 'fail2ban'],
                capture_output=True, text=True, timeout=2
            )
            if active_check.stdout.strip() != 'active':
                result['detail'] = 'Fail2Ban installed but service not active'
                self.cached_results[cache_key] = result
                self.last_check_times[cache_key] = current_time
                return result

            result['active'] = True

            # Get list of active jails
            jails_result = subprocess.run(
                ['fail2ban-client', 'status'],
                capture_output=True, text=True, timeout=3
            )

            jails = []
            if jails_result.returncode == 0:
                for line in jails_result.stdout.split('\n'):
                    if 'Jail list:' in line:
                        jail_str = line.split('Jail list:')[1].strip()
                        jails = [j.strip() for j in jail_str.split(',') if j.strip()]
                        break

            if not jails:
                result['detail'] = 'Fail2Ban active, no jails configured'
                self.cached_results[cache_key] = result
                self.last_check_times[cache_key] = current_time
                return result

            result['jails'] = jails

            # Check each jail for banned IPs
            total_banned = 0
            all_banned_ips = []
            jails_with_bans = []

            for jail in jails:
                try:
                    jail_result = subprocess.run(
                        ['fail2ban-client', 'status', jail],
                        capture_output=True, text=True, timeout=2
                    )
                    if jail_result.returncode == 0:
                        for line in jail_result.stdout.split('\n'):
                            if 'Currently banned:' in line:
                                try:
                                    count = int(line.split('Currently banned:')[1].strip())
                                    if count > 0:
                                        total_banned += count
                                        jails_with_bans.append(jail)
                                except (ValueError, IndexError):
                                    pass
                            elif 'Banned IP list:' in line:
                                ips_str = line.split('Banned IP list:')[1].strip()
                                if ips_str:
                                    ips = [ip.strip() for ip in ips_str.split() if ip.strip()]
                                    all_banned_ips.extend(ips[:10])  # Limit to 10 IPs per jail
                except Exception:
                    pass

            result['banned_count'] = total_banned
            result['banned_ips'] = all_banned_ips[:20]  # Max 20 total

            if total_banned > 0:
                jails_str = ', '.join(jails_with_bans)
                msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})'
                result['status'] = 'WARNING'
                result['detail'] = msg
                # Persistence handled by _check_security caller via security_fail2ban key
            else:
                result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)'
                # Auto-resolve if previously banned IPs are now gone
                if health_persistence.is_error_active('fail2ban'):
                    health_persistence.clear_error('fail2ban')

        except Exception as e:
            result['detail'] = f'Unable to check Fail2Ban: {str(e)[:50]}'

        self.cached_results[cache_key] = result
        self.last_check_times[cache_key] = current_time
        return result

    def _check_security(self) -> Dict[str, Any]:
        """
        Run the security sub-checks and fold them into one category result.

        Sub-checks (each becomes an entry in the returned 'checks' dict):
        - uptime: more than a year of uptime while the cached updates check also
          reports more than 365 days since the last update
        - certificates: result of _check_certificates()
        - login_attempts: failed logins found in the last 24h of the journal
          (count cached for _JOURNALCTL_24H_CACHE_TTL seconds)
        - fail2ban: currently banned IPs, only present when Fail2Ban is installed

        Every sub-check that is not OK/INFO is recorded through health_persistence;
        sub-checks the user already acknowledged are flagged 'dismissed' and do not
        contribute to the overall status.

        Returns:
          {'status': ..., 'checks': {...}} plus 'reason' when there are active issues.
          On an unexpected failure: status 'UNKNOWN' with empty 'checks'.
        """
        try:
            checks = {
                'uptime': {'status': 'OK', 'detail': ''},
                'certificates': {'status': 'OK', 'detail': ''},
                'login_attempts': {'status': 'OK', 'detail': ''},
            }

            # --- Sub-check 1: long uptime combined with a stale system --------
            try:
                uptime_days = (time.time() - psutil.boot_time()) / 86400
                if uptime_days > 365:
                    updates_data = self.cached_results.get('updates_check')
                    if updates_data and updates_data.get('days_since_update', 9999) > 365:
                        msg = f'Uptime {int(uptime_days)} days (>1 year, consider updating kernel/system)'
                        checks['uptime'] = {'status': 'WARNING', 'detail': msg, 'days': int(uptime_days), 'dismissable': True}
                    else:
                        checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days, system recently updated'}
                else:
                    checks['uptime'] = {'status': 'OK', 'detail': f'Uptime {int(uptime_days)} days'}
            except Exception:
                checks['uptime'] = {'status': 'OK', 'detail': 'Unable to determine uptime'}

            # --- Sub-check 2: SSL certificate ---------------------------------
            cert_status = self._check_certificates()
            if cert_status:
                cert_sev = cert_status.get('status', 'OK')
                cert_reason = cert_status.get('reason', '')
                checks['certificates'] = {
                    'status': cert_sev,
                    'detail': cert_reason or 'Certificate valid',
                    'dismissable': cert_sev not in ['OK', 'INFO']
                }

            # --- Sub-check 3: failed logins (brute force detection) -----------
            try:
                now = time.time()
                login_cache = self._journalctl_24h_cache

                if login_cache['time'] > 0 and now - login_cache['time'] < self._JOURNALCTL_24H_CACHE_TTL:
                    # Cached count is still fresh; skip the journal scan.
                    failed_logins = login_cache['count']
                else:
                    journal = subprocess.run(
                        ['journalctl', '--since', '24 hours ago', '--no-pager',
                         '-g', 'authentication failure|failed password|invalid user',
                         '--output=cat', '-n', '5000'],
                        capture_output=True,
                        text=True,
                        timeout=20
                    )

                    failed_logins = 0
                    if journal.returncode == 0:
                        needles = ('authentication failure', 'failed password', 'invalid user')
                        failed_logins = sum(
                            1 for entry in journal.stdout.split('\n')
                            if any(needle in entry.lower() for needle in needles)
                        )

                    self._journalctl_24h_cache = {'count': failed_logins, 'time': now}

                if failed_logins > 50:
                    msg = f'{failed_logins} failed login attempts in 24h'
                    checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
                elif failed_logins > 0:
                    checks['login_attempts'] = {'status': 'OK', 'detail': f'{failed_logins} failed attempts in 24h (within threshold)', 'count': failed_logins}
                else:
                    checks['login_attempts'] = {'status': 'OK', 'detail': 'No failed login attempts in 24h', 'count': 0}
            except Exception:
                checks['login_attempts'] = {'status': 'OK', 'detail': 'Unable to check login attempts'}

            # --- Sub-check 4: Fail2Ban (entry only exists when installed) -----
            try:
                f2b = self._check_fail2ban_bans()
                if f2b.get('installed', False):
                    f2b_status = f2b.get('status', 'OK')
                    checks['fail2ban'] = {
                        'status': f2b_status,
                        'dismissable': f2b_status != 'OK',
                        'detail': f2b.get('detail', ''),
                        'installed': True,
                        'banned_count': f2b.get('banned_count', 0)
                    }
            except Exception:
                pass

            # --- Persistence and dismissal handling ---------------------------
            # Persistence key -> name of the entry in `checks`.
            sub_check_keys = {
                'security_login_attempts': 'login_attempts',
                'security_certificates': 'certificates',
                'security_uptime': 'uptime',
                'security_fail2ban': 'fail2ban',
            }
            dismissed_keys = set()
            active_issues = []

            for err_key, check_name in sub_check_keys.items():
                entry = checks.get(check_name)
                if entry is None:
                    entry = {}
                else:
                    # Tell the frontend which persistence key belongs to this check.
                    entry['error_key'] = err_key

                severity = entry.get('status', 'OK')
                if severity in ('OK', 'INFO'):
                    # Healthy (or absent) sub-check: resolve a previously recorded error.
                    if health_persistence.is_error_active(err_key):
                        health_persistence.clear_error(err_key)
                    continue

                outcome = health_persistence.record_error(
                    error_key=err_key,
                    category='security',
                    severity=severity,
                    reason=entry.get('detail', ''),
                    details={'dismissable': entry.get('dismissable', True)}
                )
                if outcome and outcome.get('type') == 'skipped_acknowledged':
                    # Acknowledged by the user: show it as dismissed, do not count it.
                    dismissed_keys.add(err_key)
                    entry['dismissed'] = True
                else:
                    active_issues.append(entry.get('detail', ''))

            # --- Overall status from the non-dismissed issues only ------------
            if not active_issues:
                return {
                    'status': 'OK',
                    'checks': checks
                }

            has_critical = any(
                info.get('status') == 'CRITICAL'
                for name, info in checks.items()
                if f'security_{name}' not in dismissed_keys
            )
            return {
                'status': 'CRITICAL' if has_critical else 'WARNING',
                'reason': '; '.join(active_issues[:2]),
                'checks': checks
            }

        except Exception as e:
            print(f"[HealthMonitor] Security check failed: {e}")
            return {'status': 'UNKNOWN', 'reason': f'Security check unavailable: {str(e)}', 'checks': {}, 'dismissable': True}

    def _check_certificates(self) -> Optional[Dict[str, Any]]:
        """
        Check SSL certificate expiration for PVE's default certificate.
        INFO: Self-signed or no cert configured (normal for internal servers)
        WARNING: Expires <30 days
        CRITICAL: Expired
        """
        cache_key = 'certificates'
        current_time = time.time()

        # Cache for 1 day (86400 seconds)
        if cache_key in self.last_check_times:
            if current_time - self.last_check_times[cache_key] < 86400:
                return self.cached_results.get(cache_key)

        try:
            cert_path = '/etc/pve/local/pve-ssl.pem'

            if not os.path.exists(cert_path):
                cert_result = {
                    'status': 'INFO',
                    'reason': 'Self-signed or default PVE certificate'
                }
                self.cached_results[cache_key] = cert_result
                self.last_check_times[cache_key] = current_time
                return cert_result

            # Use openssl to get the expiry date
            result = subprocess.run(
                ['openssl', 'x509', '-enddate', '-noout', '-in', cert_path],
                capture_output=True,
                text=True,
                timeout=2
            )

            if result.returncode == 0:
                date_str = result.stdout.strip().replace('notAfter=', '')

                try:
                    # Parse the date string (format can vary, e.g., 'Jun 15 10:00:00 2024 GMT')
                    # Attempt common formats
                    exp_date = None
                    try:
                        # Try more detailed format first
                        exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y %Z')
                    except ValueError:
                        # Fallback to simpler format if needed
                        try:
                            exp_date = datetime.strptime(date_str, '%b %d %H:%M:%S %Y')
                        except ValueError:
                            # Fallback for "notAfter=..." string itself being the issue
                            if 'notAfter=' in date_str: # If it's the raw string itself
                                pass # Will result in 'INFO' status

                    if exp_date:
                        days_until_expiry = (exp_date - datetime.now()).days

                        if days_until_expiry < 0:
                            status = 'CRITICAL'
                            reason = 'Certificate expired'
                        elif days_until_expiry < 30:
                            status = 'WARNING'
                            reason = f'Certificate expires in {days_until_expiry} days'
                        else:
                            status = 'OK'
                            reason = None

                        cert_result = {'status': status}
                        if reason:
                            cert_result['reason'] = reason

                        self.cached_results[cache_key] = cert_result
                        self.last_check_times[cache_key] = current_time
                        return cert_result
                except Exception as e:
                    print(f"[HealthMonitor] Error parsing certificate expiry date '{date_str}': {e}")
                    # Fall through to return INFO if parsing fails

            # If openssl command failed or date parsing failed
            return {'status': 'INFO', 'reason': 'Certificate check inconclusive'}

        except Exception as e:
            print(f"[HealthMonitor] Error checking certificates: {e}")
            return {'status': 'OK'} # Return OK on exception

    def _check_disk_health_from_events(self) -> Dict[str, Any]:
        """
        Check for disk health warnings/errors from system logs (journalctl).
        Looks for SMART warnings, smartd messages, and specific disk errors.

        Returns dict keyed by '/dev/sdX' with detailed issue info including
        the actual log lines that triggered the warning, so notifications
        and the health monitor show actionable information.
        """
        disk_issues: Dict[str, Any] = {}

        try:
            # Use cached journalctl output to avoid repeated subprocess calls
            journalctl_output = self._get_journalctl_1hour_warnings()

            if not journalctl_output:
                return disk_issues

            # Collect all relevant lines per disk
            # disk_lines[disk_name] = {'smart_lines': [], 'io_lines': [], 'severity': 'WARNING'}
            disk_lines: Dict[str, Dict] = {}

            for line in journalctl_output.split('\n'):
                if not line.strip():
                    continue
                line_lower = line.lower()

                # Extract disk name -- multiple patterns for different log formats:
                #   /dev/sdh, /dev/nvme0n1
                #   Device: /dev/sdh [SAT]  (smartd format)
                #   smartd[1234]: Device: /dev/sdh ...
                disk_match = re.search(
                    r'(?:/dev/|Device:?\s*/dev/)(sd[a-z]+|nvme\d+n\d+|hd[a-z]+)',
                    line)
                if not disk_match:
                    # Fallback for smartd messages that reference disk names differently
                    if 'smartd' in line_lower or 'smart' in line_lower:
                        disk_match = re.search(r'\b(sd[a-z]+|nvme\d+n\d+)\b', line)
                if not disk_match:
                    continue
                disk_name = disk_match.group(1)

                if disk_name not in disk_lines:
                    disk_lines[disk_name] = {
                        'smart_lines': [], 'io_lines': [],
                        'severity': 'WARNING'
                    }

                # Classify the log line
                # SMART warnings: smartd messages, SMART attribute warnings, etc.
                if ('smart' in line_lower and
                    any(kw in line_lower for kw in
                        ['warning', 'error', 'fail', 'exceeded', 'threshold',
                         'reallocat', 'pending', 'uncorrect', 'crc', 'offline',
                         'temperature', 'current_pending', 'reported_uncorrect'])):
                    # Extract the meaningful part of the log line (after hostname)
                    msg_part = line.split(': ', 2)[-1] if ': ' in line else line
                    disk_lines[disk_name]['smart_lines'].append(msg_part.strip())

                # smartd daemon messages (e.g. "smartd[1234]: Device: /dev/sdh ...")
                elif 'smartd' in line_lower:
                    msg_part = line.split(': ', 2)[-1] if ': ' in line else line
                    disk_lines[disk_name]['smart_lines'].append(msg_part.strip())

                # Disk I/O / medium errors
                elif any(kw in line_lower for kw in
                         ['disk error', 'ata error', 'medium error', 'io error',
                          'i/o error', 'blk_update_request', 'sense key']):
                    msg_part = line.split(': ', 2)[-1] if ': ' in line else line
                    disk_lines[disk_name]['io_lines'].append(msg_part.strip())
                    disk_lines[disk_name]['severity'] = 'CRITICAL'

            # Build issues with detailed reasons
            for disk_name, info in disk_lines.items():
                dev_path = f'/dev/{disk_name}'
                smart_lines = info['smart_lines']
                io_lines = info['io_lines']
                severity = info['severity']

                if not smart_lines and not io_lines:
                    continue

                # Skip if disk no longer exists (stale journal entries)
                if not os.path.exists(dev_path):
                    # Also check base device for partitions (e.g., /dev/sda1 -> /dev/sda)
                    base_disk = re.sub(r'\d+$', '', disk_name)
                    base_path = f'/dev/{base_disk}'
                    if not os.path.exists(base_path):
                        continue  # Disk was removed, skip this error

                # Build a descriptive reason from the actual log entries
                # Deduplicate similar messages (keep unique ones)
                seen_msgs = set()
                unique_smart = []
                for msg in smart_lines:
                    # Normalize for dedup: strip timestamps and volatile parts
                    norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
                    if norm not in seen_msgs:
                        seen_msgs.add(norm)
                        unique_smart.append(msg)

                unique_io = []
                for msg in io_lines:
                    norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
                    if norm not in seen_msgs:
                        seen_msgs.add(norm)
                        unique_io.append(msg)

                # Compose the reason with actual details
                parts = []
                if unique_smart:
                    if len(unique_smart) == 1:
                        parts.append(unique_smart[0])
                    else:
                        parts.append(f'{len(unique_smart)} SMART warnings')
                        # Include the first 3 most relevant entries
                        for entry in unique_smart[:3]:
                            parts.append(f'  - {entry}')

                if unique_io:
                    if len(unique_io) == 1:
                        parts.append(unique_io[0])
                    else:
                        parts.append(f'{len(unique_io)} I/O errors')
                        for entry in unique_io[:3]:
                            parts.append(f'  - {entry}')

                reason = '\n'.join(parts) if parts else 'SMART/disk warning in system logs'

                # Keep first sample line for observation recording
                sample_line = (unique_smart[0] if unique_smart else
                               unique_io[0] if unique_io else '')

                disk_issues[dev_path] = {
                    'status': severity,
                    'reason': reason,
                    'device': disk_name,
                    'smart_lines': unique_smart[:5],
                    'io_lines': unique_io[:5],
                    'sample': sample_line,
                    'source': 'journal',
                    'dismissable': True,
                    'error_key': f'smart_{disk_name}',
                }

                # Record as disk observation for the permanent history
                try:
                    obs_type = 'smart_error' if unique_smart else 'io_error'
                    # Build a stable signature from the error family, not the volatile details
                    if unique_smart:
                        sig_base = 'smart_journal'
                        # Classify SMART warnings by type
                        all_text = ' '.join(unique_smart).lower()
                        if any(kw in all_text for kw in ['reallocat', 'pending', 'uncorrect']):
                            sig_base = 'smart_sector_issues'
                        elif 'temperature' in all_text:
                            sig_base = 'smart_temperature'
                        elif 'crc' in all_text or 'udma' in all_text:
                            sig_base = 'smart_crc_errors'
                        elif 'fail' in all_text:
                            sig_base = 'smart_test_failed'
                    else:
                        sig_base = 'journal_io_error'

                    obs_sig = f'{sig_base}_{disk_name}'

                    # Try to get serial for proper cross-referencing
                    obs_serial = None
                    try:
                        sm = subprocess.run(
                            ['smartctl', '-i', dev_path],
                            capture_output=True, text=True, timeout=3)
                        if sm.returncode in (0, 4):
                            for sline in sm.stdout.split('\n'):
                                if 'Serial Number' in sline or 'Serial number' in sline:
                                    obs_serial = sline.split(':')[-1].strip()
                                    break
                    except Exception:
                        pass

                    health_persistence.record_disk_observation(
                        device_name=disk_name,
                        serial=obs_serial,
                        error_type=obs_type,
                        error_signature=obs_sig,
                        raw_message=f'/dev/{disk_name}: {reason}',
                        severity=severity.lower(),
                    )
                except Exception:
                    pass

        except Exception as e:
            print(f"[HealthMonitor] Error checking disk health from events: {e}")

        return disk_issues

    def _check_zfs_pool_health(self) -> Dict[str, Any]:
        """
        Check ZFS pool health status using 'zpool status' command.
        Returns dict of pools with non-ONLINE status (DEGRADED, FAULTED, UNAVAIL, etc.).
        """
        zfs_issues = {}

        try:
            # First check if 'zpool' command exists to avoid errors on non-ZFS systems
            result_which = subprocess.run(
                ['which', 'zpool'],
                capture_output=True,
                text=True,
                timeout=1
            )

            if result_which.returncode != 0:
                # ZFS is not installed or 'zpool' command not in PATH, so no ZFS issues to report.
                return zfs_issues

            # Get list of all pools and their health status
            result = subprocess.run(
                ['zpool', 'list', '-H', '-o', 'name,health'], # -H for no header
                capture_output=True,
                text=True,
                timeout=5
            )

            if result.returncode == 0:
                lines = result.stdout.strip().split('\n')
                for line in lines:
                    if not line.strip():
                        continue

                    parts = line.split()
                    if len(parts) >= 2:
                        pool_name = parts[0]
                        pool_health = parts[1].upper() # Ensure uppercase for consistent comparison

                        # 'ONLINE' is the healthy state. Any other status indicates a problem.
                        if pool_health != 'ONLINE':
                            if pool_health in ['DEGRADED', 'FAULTED', 'UNAVAIL', 'REMOVED']:
                                # These are critical states
                                status = 'CRITICAL'
                                reason = f'ZFS pool {pool_health.lower()}'
                            else:
                                # Any other non-ONLINE state is at least a warning
                                status = 'WARNING'
                                reason = f'ZFS pool status: {pool_health.lower()}'

                            # Use a unique key for each pool issue
                            zfs_issues[f'zpool_{pool_name}'] = {
                                'status': status,
                                'reason': reason,
                                'pool_name': pool_name,
                                'health': pool_health
                            }
        except Exception as e:
            print(f"[HealthMonitor] Error checking ZFS pool health: {e}")
            # If 'zpool status' command itself fails, we can't report ZFS issues.
            # Return empty dict as no specific ZFS issues were detected by this check.
            pass

        return zfs_issues

    def _check_proxmox_storage(self) -> Optional[Dict[str, Any]]:
        """
        Check Proxmox storage status using the proxmox_storage_monitor module.

        Detects storages that are configured in PVE but currently unavailable and
        builds a per-storage ``checks`` dict for display.

        Behavior:
        - Storages excluded from health monitoring are reported as INFO and never
          influence the overall status.
        - A persisted ``storage_unavailable_<name>`` error is cleared as soon as
          that storage is no longer reported unavailable, even while other
          storages are still down.
        - During the startup grace period (first 5 minutes after boot),
          unavailable storages are reported as INFO instead of CRITICAL and no
          persistent errors are recorded. This prevents false positives when
          NFS/PBS/remote storage is still mounting.

        Returns:
            ``{'status': 'OK', 'checks': ...}`` when every monitored storage is
            available; a dict with status ``'CRITICAL'`` (or ``'INFO'`` plus
            ``'grace_period': True`` during startup) including ``'reason'``,
            ``'details'`` and ``'checks'`` otherwise; ``None`` if the storage
            monitor module is not available or the check itself failed.
        """
        if not PROXMOX_STORAGE_AVAILABLE:
            return None

        # Check if we're in startup grace period
        in_grace_period = _is_startup_health_grace()
        error_key_prefix = 'storage_unavailable_'

        try:
            # Reload configuration to ensure we have the latest storage definitions
            proxmox_storage_monitor.reload_configuration()

            # Get the current status of all configured storages
            storage_status = proxmox_storage_monitor.get_storage_status()
            unavailable_storages = storage_status.get('unavailable', [])
            available_storages = storage_status.get('available', [])

            # Get excluded storage names for health monitoring
            excluded_names = health_persistence.get_excluded_storage_names('health')

            # Separate excluded storages from real issues
            excluded_unavailable = [s for s in unavailable_storages if s.get('name', '') in excluded_names]
            real_unavailable = [s for s in unavailable_storages if s.get('name', '') not in excluded_names]
            still_down = {s.get('name', 'unknown') for s in real_unavailable}

            # Clear persisted errors for storages that have recovered. This runs on
            # every check (not only when everything is healthy) so a recovered
            # storage does not keep a stale error while another one is still down.
            for error in health_persistence.get_active_errors():
                error_key = error.get('error_key', '')
                if error.get('category') != 'storage' or not error_key.startswith(error_key_prefix):
                    continue
                # Strip only the leading prefix; the storage name itself may
                # legitimately contain the same substring.
                recorded_name = error_key[len(error_key_prefix):]
                if recorded_name not in excluded_names and recorded_name not in still_down:
                    health_persistence.clear_error(error_key)

            if not real_unavailable:
                # All non-excluded storages are available.
                # Build checks from all configured storages for descriptive display
                checks = {}
                for st in available_storages:
                    st_type = st.get('type', 'unknown')
                    checks[st.get('name', 'unknown')] = {
                        'status': 'OK',
                        'detail': f'{st_type} storage available'
                    }

                # Add excluded unavailable storages as INFO (not CRITICAL)
                for st in excluded_unavailable:
                    st_type = st.get('type', 'unknown')
                    checks[st.get('name', 'unknown')] = {
                        'status': 'INFO',
                        'detail': f'{st_type} storage excluded from monitoring',
                        'excluded': True
                    }

                if not checks:
                    checks['proxmox_storages'] = {'status': 'OK', 'detail': 'All storages available'}
                return {'status': 'OK', 'checks': checks}

            storage_details = {}
            checks = {}
            # Only process non-excluded unavailable storages as errors
            for storage in real_unavailable:
                # Tolerate entries without a name instead of aborting the whole check
                storage_name = storage.get('name', 'unknown')
                storage_type = storage.get('type', 'unknown')
                status_detail = storage.get('status_detail', 'unavailable')

                # Formulate a descriptive reason for the issue
                if status_detail == 'not_found':
                    reason = f"Storage '{storage_name}' is configured but not found on the server."
                elif status_detail == 'unavailable':
                    reason = f"Storage '{storage_name}' is not available (connection error or backend issue)."
                else:
                    reason = f"Storage '{storage_name}' has status: {status_detail}."

                # Details entry with dismissable false for frontend
                storage_details[storage_name] = {
                    'reason': reason,
                    'type': storage_type,
                    'status': status_detail,
                    'dismissable': False
                }

                if in_grace_period:
                    # Storage may still be mounting: report as INFO and do not
                    # record a persistent error.
                    checks[storage_name] = {
                        'status': 'INFO',
                        'detail': f"[Startup] {reason} (checking...)",
                        'dismissable': False,
                        'grace_period': True
                    }
                else:
                    health_persistence.record_error(
                        error_key=f'{error_key_prefix}{storage_name}',
                        category='storage',
                        severity='CRITICAL',
                        reason=reason,
                        details={
                            'storage_name': storage_name,
                            'storage_type': storage_type,
                            'status_detail': status_detail,
                            'dismissable': False
                        }
                    )
                    checks[storage_name] = {
                        'status': 'CRITICAL',
                        'detail': reason,
                        'dismissable': False
                    }

            # Add excluded unavailable storages as INFO (not as errors)
            for st in excluded_unavailable:
                st_type = st.get('type', 'unknown')
                checks[st.get('name', 'unknown')] = {
                    'status': 'INFO',
                    'detail': f'{st_type} storage excluded from monitoring (offline)',
                    'excluded': True
                }

            # Also add available storages; an entry already present for an
            # unavailable or excluded storage of the same name takes precedence.
            for st in available_storages:
                checks.setdefault(st.get('name', 'unknown'), {
                    'status': 'OK',
                    'detail': f'{st.get("type", "unknown")} storage available'
                })

            # At least one monitored storage is unavailable at this point.
            if in_grace_period:
                # During grace period, return INFO instead of CRITICAL
                return {
                    'status': 'INFO',
                    'reason': f'{len(real_unavailable)} storage(s) not yet available (startup)',
                    'details': storage_details,
                    'checks': checks,
                    'grace_period': True
                }
            return {
                'status': 'CRITICAL',
                'reason': f'{len(real_unavailable)} Proxmox storage(s) unavailable',
                'details': storage_details,
                'checks': checks
            }

        except Exception as e:
            print(f"[HealthMonitor] Error checking Proxmox storage: {e}")
            # Return None on exception to indicate the check could not be performed, not necessarily a failure.
            return None

    def get_health_status(self) -> Dict[str, Any]:
        """
        Assemble the complete health report.

        Runs the detailed checks first, then the overall status and the system
        info, and bundles the three results together with an ISO-8601 timestamp
        taken after all of them have completed.
        """
        # The detailed pass goes first so every check (cached or not) is triggered
        # before the overall status is computed.
        detailed = self.get_detailed_status()
        overall = self.get_overall_status()

        report = {
            'system_info': self.get_system_info(),
            'overall_health': overall,
            'detailed_health': detailed,
        }
        report['timestamp'] = datetime.now().isoformat()
        return report

    # Duplicate get_detailed_status was removed during refactor (v1.1)


# Global instance: a single module-level HealthMonitor, constructed when this
# module is imported (presumably the object other components import and use).
health_monitor = HealthMonitor()