Update notification service

2026-06-03 13:54:41 +00:00 · 2026-03-08 22:47:04 +01:00
parent b8cff3e699
commit 3739560956
7 changed files with 467 additions and 756 deletions
@@ -30,7 +30,6 @@ import {
  ChevronRight,
  Settings2,
  HelpCircle,
-  Usb,
 } from "lucide-react"

 interface CategoryCheck {
@@ -415,44 +414,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
  ) => {
    if (!checks || Object.keys(checks).length === 0) return null

-    // Sort checks: non-disk entries first, then disk entries sorted by device name
-    const sortedEntries = Object.entries(checks)
-      .filter(([, checkData]) => checkData.installed !== false)
-      .sort(([keyA, dataA], [keyB, dataB]) => {
-        const isDiskA = dataA.is_disk_entry === true
-        const isDiskB = dataB.is_disk_entry === true
-        if (isDiskA && !isDiskB) return 1
-        if (!isDiskA && isDiskB) return -1
-        if (isDiskA && isDiskB) {
-          // Sort disks by device name
-          const deviceA = dataA.device || keyA
-          const deviceB = dataB.device || keyB
-          return deviceA.localeCompare(deviceB)
-        }
-        return 0
-      })
-
    return (
      <div className="mt-2 space-y-0.5">
-        {sortedEntries.map(([checkKey, checkData]) => {
+        {Object.entries(checks)
+          .filter(([, checkData]) => checkData.installed !== false)
+          .map(([checkKey, checkData]) => {
          const isDismissable = checkData.dismissable === true
          const checkStatus = checkData.status?.toUpperCase() || "OK"
-          const isDiskEntry = checkData.is_disk_entry === true
-
-          // For disk entries, format label specially
-          let displayLabel = formatCheckLabel(checkKey)
-          let diskIcon = null
-          if (isDiskEntry) {
-            displayLabel = checkData.device || checkKey.replace(/_/g, '/')
-            const diskType = checkData.disk_type || ''
-            if (diskType === 'USB') {
-              diskIcon = <Usb className="h-3 w-3 text-orange-400 mr-1" />
-            } else if (diskType === 'NVMe') {
-              diskIcon = <HardDrive className="h-3 w-3 text-blue-400 mr-1" />
-            } else {
-              diskIcon = <HardDrive className="h-3 w-3 text-muted-foreground mr-1" />
-            }
-          }

          return (
            <div
@@ -461,15 +429,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
            >
              <div className="flex items-start gap-1.5 sm:gap-2 min-w-0 flex-1">
                <span className="mt-0.5 shrink-0">{getStatusIcon(checkData.dismissed ? "INFO" : checkData.status, "sm")}</span>
-                <span className="font-medium shrink-0 flex items-center">
-                  {diskIcon}
-                  {displayLabel}
-                  {isDiskEntry && checkData.disk_type && (
-                    <Badge variant="outline" className="ml-1.5 text-[8px] px-1 py-0 h-3.5 shrink-0">
-                      {checkData.disk_type}
-                    </Badge>
-                  )}
-                </span>
+                <span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
                <span className="text-muted-foreground break-words whitespace-pre-wrap min-w-0">{checkData.detail}</span>
                {checkData.dismissed && (
                  <Badge variant="outline" className="text-[9px] px-1 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
@@ -499,7 +459,6 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
                    )}
                  </Button>
                )}
-
              </div>
            </div>
          )
@@ -1016,59 +1016,34 @@ export function StorageOverview() {
                    className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors"
                    onClick={() => handleDiskClick(disk)}
                  >
-                    <div className="space-y-3">
-                      {/* Header row */}
-                      <div className="flex items-center justify-between">
-                        <div className="flex items-center gap-2">
-                          <Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
-                          <h3 className="font-semibold">/dev/{disk.name}</h3>
-                          <Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
-                        </div>
-                        <div className="flex items-center gap-2">
+                    <div className="space-y-2 mb-3">
+                      <div className="flex items-center gap-2">
+                        <Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
+                        <h3 className="font-semibold">/dev/{disk.name}</h3>
+                        <Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
+                      </div>
+                      <div className="flex items-center justify-between gap-3 pl-7">
+                        {disk.model && disk.model !== "Unknown" && (
+                          <p className="text-sm text-muted-foreground truncate flex-1 min-w-0">{disk.model}</p>
+                        )}
+                        <div className="flex items-center gap-3 flex-shrink-0">
                          {disk.temperature > 0 && (
                            <div className="flex items-center gap-1">
-                              <Thermometer className={`h-3.5 w-3.5 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
-                              <span className={`text-xs font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
+                              <Thermometer className={`h-4 w-4 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
+                              <span className={`text-sm font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
                                {disk.temperature}°C
                              </span>
                            </div>
                          )}
                          {getHealthBadge(disk.health)}
+                          {(disk.observations_count ?? 0) > 0 && (
+                            <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
+                              <Info className="h-3 w-3" />
+                              {disk.observations_count}
+                            </Badge>
+                          )}
                        </div>
                      </div>
-                      
-                      {/* Model if available */}
-                      {disk.model && disk.model !== "Unknown" && (
-                        <p className="text-sm text-muted-foreground truncate pl-7">{disk.model}</p>
-                      )}
-                      
-                      {/* Info grid - 2 columns */}
-                      <div className="grid grid-cols-2 gap-x-4 gap-y-2 pl-7 text-sm">
-                        <div>
-                          <span className="text-muted-foreground">Size</span>
-                          <p className="font-medium">{disk.size_formatted || disk.size || "N/A"}</p>
-                        </div>
-                        <div>
-                          <span className="text-muted-foreground">SMART Status</span>
-                          <p className="font-medium capitalize">{disk.smart_status || "N/A"}</p>
-                        </div>
-                        {disk.serial && disk.serial !== "Unknown" && (
-                          <div className="col-span-2">
-                            <span className="text-muted-foreground">Serial</span>
-                            <p className="font-medium text-xs truncate">{disk.serial}</p>
-                          </div>
-                        )}
-                      </div>
-                      
-                      {/* Observations badge if any */}
-                      {(disk.observations_count ?? 0) > 0 && (
-                        <div className="pl-7">
-                          <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
-                            <Info className="h-3 w-3" />
-                            {disk.observations_count} observation{disk.observations_count > 1 ? 's' : ''}
-                          </Badge>
-                        </div>
-                      )}
                    </div>
                  </div>

@@ -1314,7 +1289,7 @@ export function StorageOverview() {
              </div>

              {/* Observations Section */}
-              {(diskObservations.length > 0 || loadingObservations || (selectedDisk.observations_count ?? 0) > 0) && (
+              {(diskObservations.length > 0 || loadingObservations) && (
                <div className="border-t pt-4">
                  <h4 className="font-semibold mb-2 flex items-center gap-2">
                    <Info className="h-4 w-4 text-blue-400" />
@@ -2554,55 +2554,6 @@ def get_smart_data(disk_name):
        import traceback
        traceback.print_exc()
    
-    # ── Integrate persistent worst_health ──
-    # The health should never improve from a previous worst state without admin intervention.
-    # This prevents disks from showing "healthy" after they had issues that may have auto-resolved.
-    try:
-        current_health = smart_data['health']
-        serial = smart_data.get('serial', '')
-        
-        # Get persistent worst_health
-        worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None)
-        
-        if worst_info:
-            worst_health = worst_info.get('worst_health', 'healthy')
-            admin_cleared = worst_info.get('admin_cleared', False)
-            
-            # Only apply worst_health if not cleared by admin
-            if not admin_cleared:
-                severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2}
-                current_severity = severity_order.get(current_health, 0)
-                worst_severity = severity_order.get(worst_health, 0)
-                
-                # If worst_health is worse than current, use worst_health
-                if worst_severity > current_severity:
-                    smart_data['health'] = worst_health
-                    smart_data['health_source'] = 'persistent'
-                    smart_data['worst_health_date'] = worst_info.get('worst_health_date')
-                    smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '')
-        
-        # Update worst_health if current is worse (and not already stored)
-        if current_health in ('warning', 'critical'):
-            health_reason = ''
-            if smart_data.get('pending_sectors', 0) > 0:
-                health_reason = f"{smart_data['pending_sectors']} pending sector(s)"
-            if smart_data.get('reallocated_sectors', 0) > 0:
-                if health_reason:
-                    health_reason += f", {smart_data['reallocated_sectors']} reallocated"
-                else:
-                    health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)"
-            if smart_data.get('smart_status') == 'failed':
-                health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
-            
-            health_persistence.update_disk_worst_health(
-                disk_name, 
-                serial if serial != 'Unknown' else None,
-                current_health,
-                health_reason
-            )
-    except Exception as e:
-        # print(f"[v0] Error integrating worst_health: {e}")
-        pass

    return smart_data

@@ -1034,19 +1034,38 @@ class HealthMonitor:
                io_error_key = f'disk_{device}'
                error_key = f'smart_{device}'
                reason = f'{disk}: {issue["reason"]}'
+                severity = issue.get('status', 'WARNING')
+                
+                # Get serial for this disk to properly track it (important for USB disks)
+                disk_serial = ''
+                disk_model = ''
+                try:
+                    smart_result = subprocess.run(
+                        ['smartctl', '-i', '-j', f'/dev/{device}'],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    if smart_result.returncode in (0, 4):
+                        import json
+                        smart_data = json.loads(smart_result.stdout)
+                        disk_serial = smart_data.get('serial_number', '')
+                        disk_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+                except Exception:
+                    pass
+                
                try:
                    if (not health_persistence.is_error_active(io_error_key, category='disks') and
                        not health_persistence.is_error_active(error_key, category='disks')):
                        health_persistence.record_error(
                            error_key=error_key,
                            category='disks',
-                            severity=issue.get('status', 'WARNING'),
+                            severity=severity,
                            reason=reason,
                            details={
                                'disk': device,
                                'device': disk,
                                'block_device': device,
-                                'serial': '',
+                                'serial': disk_serial,
+                                'model': disk_model,
                                'smart_status': 'WARNING',
                                'smart_lines': issue.get('smart_lines', []),
                                'io_lines': issue.get('io_lines', []),
@@ -1055,6 +1074,12 @@ class HealthMonitor:
                                'dismissable': True,
                            }
                        )
+                    # Update worst_health for the disk (persists even if current error clears)
+                    # Use serial for proper USB disk tracking
+                    health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower())
+                    # Also register the disk for observation tracking
+                    if disk_serial:
+                        health_persistence.register_disk(device, disk_serial, disk_model, 0)
                except Exception:
                    pass
        
@@ -1073,16 +1098,205 @@ class HealthMonitor:
                if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
                    issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
                    storage_details[disk_path] = disk_info
+                # Update worst_health for I/O errors
+                device = disk_path.replace('/dev/', '')
+                io_severity = disk_info.get('status', 'WARNING').lower()
+                
+                # Get serial for proper disk tracking (important for USB)
+                io_serial = ''
+                io_model = ''
+                try:
+                    smart_result = subprocess.run(
+                        ['smartctl', '-i', '-j', f'/dev/{device}'],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    if smart_result.returncode in (0, 4):
+                        import json
+                        smart_data = json.loads(smart_result.stdout)
+                        io_serial = smart_data.get('serial_number', '')
+                        io_model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+                except Exception:
+                    pass
+                
+                try:
+                    health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity)
+                    if io_serial:
+                        health_persistence.register_disk(device, io_serial, io_model, 0)
+                except Exception:
+                    pass
        
-        # Build checks dict from storage_details, adding OK entries for items with no issues
+        # Build checks dict from storage_details
+        # We consolidate disk error entries (like /Dev/Sda) into physical disk entries
+        # and only show disks with problems (not healthy ones).
        checks = {}
+        disk_errors_by_device = {}  # Collect disk errors for consolidation
+        
        for key, val in storage_details.items():
+            # Check if this is a disk device entry (e.g., /Dev/Sda, /dev/sda, sda)
+            key_lower = key.lower()
+            is_disk_entry = (
+                key_lower.startswith('/dev/') or 
+                key_lower.startswith('dev/') or
+                (len(key_lower) <= 10 and (key_lower.startswith('sd') or 
+                 key_lower.startswith('nvme') or key_lower.startswith('hd')))
+            )
+            
+            if is_disk_entry:
+                # Extract device name and collect for consolidation
+                device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
+                if device_name and len(device_name) <= 15:
+                    if device_name not in disk_errors_by_device:
+                        disk_errors_by_device[device_name] = {
+                            'status': val.get('status', 'WARNING'),
+                            'detail': val.get('reason', ''),
+                            'error_key': val.get('error_key'),
+                            'dismissable': val.get('dismissable', True),
+                        }
+                    else:
+                        # Merge: keep worst status
+                        existing = disk_errors_by_device[device_name]
+                        if val.get('status') == 'CRITICAL':
+                            existing['status'] = 'CRITICAL'
+                        # Append detail if different
+                        new_detail = val.get('reason', '')
+                        if new_detail and new_detail not in existing.get('detail', ''):
+                            existing['detail'] = f"{existing['detail']}; {new_detail}".strip('; ')
+                    continue  # Don't add raw disk error entry, we'll add consolidated later
+            
+            # Non-disk entries go directly to checks
            checks[key] = {
                'status': val.get('status', 'OK'),
                'detail': val.get('reason', 'OK'),
                **{k: v for k, v in val.items() if k not in ('status', 'reason')}
            }
        
+        # Get physical disk info for matching errors to disks
+        # This uses the same detection as flask_server.py /api/storage/info
+        physical_disks = {}
+        try:
+            result = subprocess.run(
+                ['lsblk', '-b', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN'],
+                capture_output=True, text=True, timeout=5
+            )
+            if result.returncode == 0:
+                for line in result.stdout.strip().split('\n'):
+                    if not line.strip():
+                        continue
+                    parts = line.split()
+                    if len(parts) >= 3 and parts[2] == 'disk':
+                        disk_name = parts[0]
+                        # Skip virtual devices
+                        if disk_name.startswith(('zd', 'zram', 'loop', 'ram', 'dm-')):
+                            continue
+                        tran = parts[3].upper() if len(parts) > 3 else ''
+                        is_usb = tran == 'USB'
+                        is_nvme = disk_name.startswith('nvme')
+                        
+                        # Get serial from smartctl
+                        serial = ''
+                        model = ''
+                        try:
+                            smart_result = subprocess.run(
+                                ['smartctl', '-i', '-j', f'/dev/{disk_name}'],
+                                capture_output=True, text=True, timeout=5
+                            )
+                            if smart_result.returncode in (0, 4):  # 4 = SMART not available but info OK
+                                import json
+                                smart_data = json.loads(smart_result.stdout)
+                                serial = smart_data.get('serial_number', '')
+                                model = smart_data.get('model_name', '') or smart_data.get('model_family', '')
+                        except Exception:
+                            pass
+                        
+                        physical_disks[disk_name] = {
+                            'serial': serial,
+                            'model': model,
+                            'is_usb': is_usb,
+                            'is_nvme': is_nvme,
+                            'disk_type': 'USB' if is_usb else ('NVMe' if is_nvme else 'SATA'),
+                        }
+        except Exception:
+            pass
+        
+        # Add consolidated disk entries (only for disks with errors)
+        for device_name, error_info in disk_errors_by_device.items():
+            # Try to find this disk in physical_disks for enriched info
+            disk_info = physical_disks.get(device_name, {})
+            
+            # If not found by name, try to match by serial (NOTE: error_info as built above carries no 'serial' key, so this fallback currently never matches)
+            if not disk_info:
+                error_serial = error_info.get('serial', '')
+                if error_serial:
+                    for dk, di in physical_disks.items():
+                        if di.get('serial', '').lower() == error_serial.lower():
+                            disk_info = di
+                            device_name = dk  # Update device name to matched disk
+                            break
+            
+            # Determine disk type
+            disk_type = disk_info.get('disk_type', 'SATA')
+            if not disk_info:
+                # Fallback detection
+                if device_name.startswith('nvme'):
+                    disk_type = 'NVMe'
+                else:
+                    # Check if USB via sysfs
+                    try:
+                        usb_check = subprocess.run(
+                            ['readlink', '-f', f'/sys/block/{device_name}'],
+                            capture_output=True, text=True, timeout=2
+                        )
+                        if 'usb' in usb_check.stdout.lower():
+                            disk_type = 'USB'
+                    except Exception:
+                        pass
+            
+            serial = disk_info.get('serial', '')
+            model = disk_info.get('model', '')
+            
+            # Get worst_health from persistence
+            try:
+                health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None)
+                worst_health = health_status.get('worst_health', 'healthy')
+                
+                # Final health = max(current, worst)
+                health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2}
+                current_level = health_order.get(error_info['status'].lower(), 1)
+                worst_level = health_order.get(worst_health.lower(), 0)
+                
+                if worst_level > current_level:
+                    # worst_health is worse, use it
+                    final_status = worst_health.upper()
+                else:
+                    final_status = error_info['status']
+            except Exception:
+                final_status = error_info['status']
+            
+            # Build detail string with serial/model if available
+            detail = error_info['detail']
+            if serial and serial not in detail:
+                detail = f"{serial} - {detail}"
+            
+            # Create consolidated disk entry
+            check_key = f'/dev/{device_name}'
+            checks[check_key] = {
+                'status': final_status,
+                'detail': detail,
+                'disk_type': disk_type,
+                'device': f'/dev/{device_name}',
+                'serial': serial,
+                'model': model,
+                'error_key': error_info.get('error_key') or f'disk_{device_name}',
+                'dismissable': error_info.get('dismissable', True),
+                'is_disk_entry': True,
+            }
+            
+            # Register disk in persistence if not already (for worst_health tracking)
+            try:
+                health_persistence.register_disk(device_name, serial if serial else None, model, 0)
+            except Exception:
+                pass
+        
        # ALWAYS add descriptive entries for capabilities this server has.
        # When everything is OK, they show as OK.  When there are issues,
        # they still appear so the user can see the full picture (e.g.
@@ -1105,120 +1319,8 @@ class HealthMonitor:
        if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
            checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
        
-        # Get physical disks list for UI display
-        physical_disks = self._get_physical_disks_list()
-        
-        # Collect disk error entries (SMART, I/O, etc.) from checks that should be merged with disk entries
-        # These have keys like '/Dev/Sda', '/dev/sda', 'sda', etc.
-        disk_errors_by_device = {}
-        keys_to_remove = []
-        for key, val in checks.items():
-            # Skip non-disk error entries (like lvm_check, root_fs, etc.)
-            key_lower = key.lower()
-            
-            # Check if this looks like a disk error entry
-            is_disk_error = False
-            device_name = None
-            
-            if key_lower.startswith('/dev/') or key_lower.startswith('dev/'):
-                # Keys like '/Dev/Sda', '/dev/sda'
-                device_name = key_lower.replace('/dev/', '').replace('dev/', '').strip('/')
-                is_disk_error = True
-            elif key_lower.startswith('sd') or key_lower.startswith('nvme') or key_lower.startswith('hd'):
-                # Keys like 'sda', 'nvme0n1'
-                device_name = key_lower
-                is_disk_error = True
-            
-            if is_disk_error and device_name and len(device_name) <= 15:
-                # Store the error info, merging if we already have an error for this device
-                if device_name not in disk_errors_by_device:
-                    disk_errors_by_device[device_name] = {
-                        'status': val.get('status', 'WARNING'),
-                        'detail': val.get('detail', val.get('reason', '')),
-                        'error_key': val.get('error_key'),
-                        'dismissable': val.get('dismissable', True),
-                        'dismissed': val.get('dismissed', False),
-                    }
-                else:
-                    # Merge: keep the worst status
-                    existing = disk_errors_by_device[device_name]
-                    if val.get('status') == 'CRITICAL':
-                        existing['status'] = 'CRITICAL'
-                    # Append details
-                    new_detail = val.get('detail', val.get('reason', ''))
-                    if new_detail and new_detail not in existing.get('detail', ''):
-                        existing['detail'] = f"{existing.get('detail', '')}; {new_detail}".strip('; ')
-                keys_to_remove.append(key)
-        
-        # Remove the old disk error entries - they'll be merged into disk entries
-        for key in keys_to_remove:
-            del checks[key]
-        
-        # Add individual disk checks for UI display (like Network interfaces)
-        for disk in physical_disks:
-            device = disk.get('device', '')
-            name = disk.get('name', '')
-            serial = disk.get('serial', '')
-            final_health = disk.get('final_health', 'healthy')
-            final_reason = disk.get('final_reason', '')
-            is_usb = disk.get('is_usb', False)
-            
-            # Format check key - use device path for uniqueness
-            check_key = device.lower().replace('/', '_')  # e.g., _dev_sda
-            
-            # Check if there's a disk error (SMART, I/O, etc.) for this disk
-            disk_error = disk_errors_by_device.get(name.lower())
-            
-            # Determine status - use disk error status if present, otherwise use final_health
-            if disk_error and disk_error.get('status') in ('WARNING', 'CRITICAL'):
-                status = disk_error['status']
-                error_detail = disk_error.get('detail', '')
-            elif final_health == 'critical':
-                status = 'CRITICAL'
-                error_detail = ''
-            elif final_health == 'warning':
-                status = 'WARNING'
-                error_detail = ''
-            else:
-                status = 'OK'
-                error_detail = ''
-            
-            # Build detail string
-            disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA')
-            detail = f'{serial}' if serial else 'Unknown serial'
-            if final_reason:
-                detail += f' - {final_reason}'
-            elif error_detail:
-                detail += f' - {error_detail}'
-            
-            # Only add to checks if not already present
-            if check_key not in checks:
-                checks[check_key] = {
-                    'status': status,
-                    'detail': detail,
-                    'device': device,
-                    'serial': serial,
-                    'disk_type': disk_type,
-                    'is_disk_entry': True,  # Flag to identify disk entries in frontend
-                    'worst_health': disk.get('worst_health', 'healthy'),
-                    'worst_health_date': disk.get('worst_health_date'),
-                    'admin_cleared': disk.get('admin_cleared', False),
-                }
-                
-                # If disk has issues, it needs an error_key for dismiss functionality
-                if status != 'OK':
-                    # Use disk error_key if available, otherwise generate one
-                    if disk_error and disk_error.get('error_key'):
-                        checks[check_key]['error_key'] = disk_error['error_key']
-                    else:
-                        checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}'
-                    checks[check_key]['dismissable'] = True
-                    # Preserve dismissed state from disk error
-                    if disk_error and disk_error.get('dismissed'):
-                        checks[check_key]['dismissed'] = True
-        
        if not issues:
-            return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks}
+            return {'status': 'OK', 'checks': checks}
        
        # ── Mark dismissed checks ──
        # If an error_key in a check has been acknowledged (dismissed) in the
@@ -1250,7 +1352,6 @@ class HealthMonitor:
                    'reason': '; '.join(issues[:3]),
                    'details': storage_details,
                    'checks': checks,
-                    'physical_disks': physical_disks,
                    'all_dismissed': True,
                }
        except Exception:
@@ -1265,8 +1366,7 @@ class HealthMonitor:
            'status': 'CRITICAL' if has_critical else 'WARNING',
            'reason': '; '.join(issues[:3]),
            'details': storage_details,
-            'checks': checks,
-            'physical_disks': physical_disks
+            'checks': checks
        }
    
    def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
@@ -1350,221 +1450,9 @@ class HealthMonitor:
                    return {'status': 'OK'} # No VGs found, LVM not in use
            
            return {'status': 'OK', 'volumes': len(volumes)}
-    
+            
        except Exception:
            return {'status': 'OK'}
-
-    def _get_physical_disks_list(self) -> List[Dict[str, Any]]:
-        """Get list of all physical disks with their health status.
-        
-        Combines real-time SMART data with persistent worst_health state.
-        Returns list suitable for display in Health Monitor UI.
-        """
-        disks = []
-        
-        try:
-            # Get all block devices
-            result = subprocess.run(
-                ['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'],
-                capture_output=True, text=True, timeout=5
-            )
-            
-            if result.returncode != 0:
-                return []
-            
-            for line in result.stdout.strip().split('\n'):
-                if not line.strip():
-                    continue
-                
-                parts = line.split(None, 5)
-                if len(parts) < 3:
-                    continue
-                
-                name = parts[0]
-                size = parts[1] if len(parts) > 1 else ''
-                dtype = parts[2] if len(parts) > 2 else ''
-                transport = parts[3] if len(parts) > 3 else ''
-                model = parts[4] if len(parts) > 4 else ''
-                serial = parts[5] if len(parts) > 5 else ''
-                
-                # Only include disk type devices
-                if dtype != 'disk':
-                    continue
-                
-                # Skip loop devices, ram disks, etc.
-                if name.startswith(('loop', 'ram', 'zram')):
-                    continue
-                
-                is_usb = transport.lower() == 'usb'
-                is_nvme = name.startswith('nvme')
-                
-                # Get current SMART status
-                current_health = 'healthy'
-                smart_status = 'UNKNOWN'
-                pending_sectors = 0
-                reallocated_sectors = 0
-                
-                try:
-                    dev_path = f'/dev/{name}'
-                    smart_result = subprocess.run(
-                        ['smartctl', '-H', '-A', dev_path],
-                        capture_output=True, text=True, timeout=5
-                    )
-                    
-                    output = smart_result.stdout
-                    
-                    # Check SMART overall status
-                    if 'PASSED' in output:
-                        smart_status = 'PASSED'
-                    elif 'FAILED' in output:
-                        smart_status = 'FAILED'
-                        current_health = 'critical'
-                    
-                    # Parse SMART attributes for pending/reallocated sectors
-                    for attr_line in output.split('\n'):
-                        if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line:
-                            parts_attr = attr_line.split()
-                            if parts_attr:
-                                try:
-                                    pending_sectors = int(parts_attr[-1])
-                                except ValueError:
-                                    pass
-                        elif 'Reallocated_Sector' in attr_line:
-                            parts_attr = attr_line.split()
-                            if parts_attr:
-                                try:
-                                    reallocated_sectors = int(parts_attr[-1])
-                                except ValueError:
-                                    pass
-                    
-                    # Determine current health based on sectors
-                    if current_health != 'critical':
-                        if pending_sectors > 10 or reallocated_sectors > 10:
-                            current_health = 'critical'
-                        elif pending_sectors > 0 or reallocated_sectors > 0:
-                            current_health = 'warning'
-                
-                except Exception:
-                    pass
-                
-                # Build health reason
-                health_reason = ''
-                if pending_sectors > 0:
-                    health_reason = f'{pending_sectors} pending sector(s)'
-                if reallocated_sectors > 0:
-                    if health_reason:
-                        health_reason += f', {reallocated_sectors} reallocated'
-                    else:
-                        health_reason = f'{reallocated_sectors} reallocated sector(s)'
-                if smart_status == 'FAILED':
-                    health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
-                
-                # Get persistent worst_health from database
-                worst_info = health_persistence.get_disk_worst_health(name, serial)
-                worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy'
-                worst_health_date = worst_info.get('worst_health_date') if worst_info else None
-                worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else ''
-                admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False
-                
-                # Update worst_health if current is worse
-                if current_health != 'healthy':
-                    updated = health_persistence.update_disk_worst_health(
-                        name, serial, current_health, health_reason
-                    )
-                    if updated:
-                        worst_health = current_health
-                        worst_health_reason = health_reason
-                    
-                    # Record as disk observation (for both internal and USB disks)
-                    # This ensures SMART issues are tracked in observations
-                    try:
-                        obs_type = 'smart_error'
-                        if pending_sectors and pending_sectors > 0:
-                            obs_type = 'pending_sectors'
-                        elif reallocated_sectors and reallocated_sectors > 0:
-                            obs_type = 'reallocated_sectors'
-                        elif smart_status == 'FAILED':
-                            obs_type = 'smart_failed'
-                        
-                        obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}'
-                        health_persistence.record_disk_observation(
-                            device_name=name,
-                            serial=serial,
-                            error_type=obs_type,
-                            error_signature=obs_sig,
-                            raw_message=f'/dev/{name}: {health_reason}',
-                            severity=current_health,
-                        )
-                        
-                        # Send smart_warning notification if this is a NEW issue
-                        # (only when updated=True means this is first time seeing this state)
-                        if updated:
-                            try:
-                                from notification_manager import notification_manager
-                                notification_manager.send_notification(
-                                    event_type='smart_warning',
-                                    data={
-                                        'device': f'/dev/{name}',
-                                        'reason': health_reason,
-                                        'serial': serial or 'Unknown',
-                                        'model': model or 'Unknown',
-                                        'pending_sectors': pending_sectors,
-                                        'reallocated_sectors': reallocated_sectors,
-                                        'smart_status': smart_status,
-                                        'hostname': self._hostname,
-                                    }
-                                )
-                            except Exception:
-                                pass
-                    except Exception:
-                        pass
-                
-                # Final health is the worse of current and persistent
-                severity_order = {'healthy': 0, 'warning': 1, 'critical': 2}
-                if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0):
-                    final_health = worst_health
-                    final_reason = worst_health_reason
-                else:
-                    final_health = current_health
-                    final_reason = health_reason
-                
-                # Get active observations count
-                obs = health_persistence.get_disk_observations(device_name=name, serial=serial)
-                active_observations = len(obs) if obs else 0
-                
-                # Register disk in persistence (for tracking)
-                try:
-                    health_persistence.register_disk(name, serial, model)
-                except Exception:
-                    pass
-                
-                disks.append({
-                    'device': f'/dev/{name}',
-                    'name': name,
-                    'serial': serial or '',
-                    'model': model or 'Unknown',
-                    'size': size,
-                    'transport': transport,
-                    'is_usb': is_usb,
-                    'is_nvme': is_nvme,
-                    'smart_status': smart_status,
-                    'current_health': current_health,
-                    'current_health_reason': health_reason,
-                    'worst_health': worst_health,
-                    'worst_health_date': worst_health_date,
-                    'worst_health_reason': worst_health_reason,
-                    'final_health': final_health,
-                    'final_reason': final_reason,
-                    'pending_sectors': pending_sectors,
-                    'reallocated_sectors': reallocated_sectors,
-                    'active_observations': active_observations,
-                    'admin_cleared': admin_cleared,
-                })
-        
-        except Exception as e:
-            print(f"[HealthMonitor] Error getting physical disks list: {e}")
-        
-        return disks
    
    # This function is no longer used in get_detailed_status, but kept for reference if needed.
    # The new _check_proxmox_storage function handles this logic better.
@@ -164,14 +164,25 @@ class HealthPersistence:
                removed INTEGER DEFAULT 0,
                worst_health TEXT DEFAULT 'healthy',
                worst_health_date TEXT,
-                worst_health_reason TEXT,
-                admin_cleared INTEGER DEFAULT 0,
-                admin_cleared_date TEXT,
-                admin_cleared_note TEXT,
+                admin_cleared TEXT,
                UNIQUE(device_name, serial)
            )
        ''')
        
+        # Migration: add the worst_health tracking columns to databases that were
+        # created before these columns existed. Each ALTER is attempted on its
+        # own so that one column already being present does not block the rest.
+        # The default uses a single-quoted SQL string literal: a double-quoted
+        # "healthy" is only accepted through a SQLite compatibility quirk that
+        # can be disabled at build time.
+        for column_ddl in (
+            "worst_health TEXT DEFAULT 'healthy'",
+            'worst_health_date TEXT',
+            'admin_cleared TEXT',
+        ):
+            try:
+                cursor.execute(f'ALTER TABLE disk_registry ADD COLUMN {column_ddl}')
+            except Exception:
+                # Best-effort by design: the ALTER is expected to fail when the
+                # column is already present.
+                pass
+        
        # Observation log: deduplicated error events per disk
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS disk_observations (
@@ -195,17 +206,6 @@ class HealthPersistence:
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
        
-        # Migration: add worst_health columns to disk_registry if not present
-        cursor.execute("PRAGMA table_info(disk_registry)")
-        disk_columns = [col[1] for col in cursor.fetchall()]
-        if 'worst_health' not in disk_columns:
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'")
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT")
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT")
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0")
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT")
-            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT")
-        
        conn.commit()
        conn.close()
    
@@ -1231,26 +1231,11 @@ class HealthPersistence:
            # a different device_name (e.g. 'ata8' instead of 'sdh'),
            # update that entry's device_name so observations carry over.
            if serial:
-                # Try exact match first
                cursor.execute('''
                    SELECT id, device_name FROM disk_registry
                    WHERE serial = ? AND serial != '' AND device_name != ?
                ''', (serial, device_name))
                old_rows = cursor.fetchall()
-                
-                # If no exact match, try normalized match (for USB disks with special chars)
-                if not old_rows:
-                    normalized = self._normalize_serial(serial)
-                    if normalized and normalized != serial:
-                        cursor.execute(
-                            'SELECT id, device_name, serial FROM disk_registry '
-                            'WHERE serial != "" AND device_name != ?', (device_name,))
-                        for row in cursor.fetchall():
-                            db_normalized = self._normalize_serial(row[2])
-                            if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
-                                old_rows.append((row[0], row[1]))
-                                break
-                
                for old_id, old_dev in old_rows:
                    # Only consolidate ATA names -> block device names
                    if old_dev.startswith('ata') and not device_name.startswith('ata'):
@@ -1288,23 +1273,6 @@ class HealthPersistence:
        except Exception as e:
            print(f"[HealthPersistence] Error registering disk {device_name}: {e}")

-    def _normalize_serial(self, serial: str) -> str:
-        """Normalize serial number for comparison.
-        
-        USB disks can have serials with escape sequences like \\x06\\x18
-        or non-printable characters. This normalizes them for matching.
-        """
-        if not serial:
-            return ''
-        import re
-        # Remove escape sequences like \x06, \x18
-        normalized = re.sub(r'\\x[0-9a-fA-F]{2}', '', serial)
-        # Remove non-printable characters
-        normalized = ''.join(c for c in normalized if c.isprintable())
-        # Remove common prefixes that vary
-        normalized = normalized.strip()
-        return normalized
-
    def _get_disk_registry_id(self, cursor, device_name: str,
                               serial: Optional[str] = None) -> Optional[int]:
        """Find disk_registry.id, matching by serial first, then device_name.
@@ -1313,25 +1281,12 @@ class HealthPersistence:
        checks entries with ATA names that share the same serial.
        """
        if serial:
-            # Try exact match first
            cursor.execute(
                'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
                (serial,))
            row = cursor.fetchone()
            if row:
                return row[0]
-            
-            # Try normalized serial match (for USB disks with special chars)
-            normalized = self._normalize_serial(serial)
-            if normalized and normalized != serial:
-                # Search for serials that start with or contain the normalized version
-                cursor.execute(
-                    'SELECT id, serial FROM disk_registry WHERE serial != "" ORDER BY last_seen DESC')
-                for row in cursor.fetchall():
-                    db_normalized = self._normalize_serial(row[1])
-                    if db_normalized == normalized or normalized in db_normalized or db_normalized in normalized:
-                        return row[0]
-        
        # Fallback: match by device_name (strip /dev/ prefix)
        clean_dev = device_name.replace('/dev/', '')
        cursor.execute(
@@ -1340,7 +1295,6 @@ class HealthPersistence:
        row = cursor.fetchone()
        if row:
            return row[0]
-        
        # Last resort: search for ATA-named entries that might refer to this device
        # This handles cases where observations were recorded under 'ata8'
        # but we're querying for 'sdh'
@@ -1353,6 +1307,131 @@ class HealthPersistence:
            pass
        return None

+    def update_disk_worst_health(self, device_name: str, serial: Optional[str],
+                                   new_health: str) -> bool:
+        """Escalate the persisted worst_health of a disk if new_health is worse.
+
+        Health hierarchy: healthy < warning < critical.
+        The stored value is only ever raised by this method, never lowered.
+        An escalation also resets admin_cleared to NULL.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None/empty when unknown.
+            new_health: 'healthy', 'warning' or 'critical' (case-insensitive).
+                Any other value, including None, is treated as 'healthy'.
+
+        Returns:
+            True if worst_health was updated, False otherwise (also on error).
+        """
+        health_order = {'healthy': 0, 'warning': 1, 'critical': 2}
+        # Guard against None / non-string severities so a bad value is ignored
+        # instead of raising AttributeError in the caller.
+        health_key = new_health.lower() if isinstance(new_health, str) else ''
+        new_level = health_order.get(health_key, 0)
+
+        if new_level == 0:  # healthy (or unknown) never updates worst_health
+            return False
+
+        now = datetime.now().isoformat()
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+            if not disk_id:
+                # Register disk first, then look it up again
+                self.register_disk(device_name.replace('/dev/', ''), serial)
+                disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+
+            if not disk_id:
+                return False
+
+            # Get current worst_health (NULL / missing row counts as healthy)
+            cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
+            row = cursor.fetchone()
+            current_worst = row[0] if row and row[0] else 'healthy'
+            current_level = health_order.get(current_worst.lower(), 0)
+
+            # Only update if new health is worse
+            if new_level <= current_level:
+                return False
+
+            cursor.execute('''
+                UPDATE disk_registry 
+                SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL
+                WHERE id = ?
+            ''', (health_key, now, disk_id))
+            conn.commit()
+            return True
+        except Exception as e:
+            print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}")
+            return False
+        finally:
+            # Always release the connection, including when a query raised.
+            if conn is not None:
+                conn.close()
+
+    def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]:
+        """Get the health status of a disk including worst_health.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None when unknown.
+
+        Returns:
+            Dict that always contains:
+              - worst_health: 'healthy', 'warning', or 'critical'
+              - worst_health_date: ISO timestamp when worst_health was set, or None
+              - admin_cleared: ISO timestamp if admin manually cleared the health, or None
+              - observations_count: Number of non-dismissed observations
+            Unknown disks and errors yield the 'healthy' defaults with a count of 0.
+        """
+        status: Dict[str, Any] = {
+            'worst_health': 'healthy',
+            'worst_health_date': None,
+            'admin_cleared': None,
+            'observations_count': 0,
+        }
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+            if not disk_id:
+                return status
+
+            cursor.execute('''
+                SELECT worst_health, worst_health_date, admin_cleared
+                FROM disk_registry WHERE id = ?
+            ''', (disk_id,))
+            row = cursor.fetchone()
+
+            # Count observations that have not been dismissed
+            cursor.execute(
+                'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0',
+                (disk_id,))
+            obs_count = cursor.fetchone()[0]
+
+            # The result is only filled in once both queries succeeded, so a
+            # failure above can never leak a half-populated dict.
+            if row:
+                status['worst_health'] = row[0] or 'healthy'
+                status['worst_health_date'] = row[1]
+                status['admin_cleared'] = row[2]
+            status['observations_count'] = obs_count
+            return status
+        except Exception as e:
+            print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}")
+            return {
+                'worst_health': 'healthy',
+                'worst_health_date': None,
+                'admin_cleared': None,
+                'observations_count': 0,
+            }
+        finally:
+            # Always release the connection, including when a query raised.
+            if conn is not None:
+                conn.close()
+
+    def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool:
+        """Admin action: clear worst_health back to healthy.
+
+        Resets worst_health to 'healthy', clears worst_health_date and stores
+        the current timestamp in admin_cleared. Rows in disk_observations are
+        not touched, so the observation history stays available for audit.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None when unknown.
+
+        Returns:
+            True if the registry row was updated, False if the disk is not
+            registered or an error occurred.
+        """
+        now = datetime.now().isoformat()
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            disk_id = self._get_disk_registry_id(cursor, device_name, serial)
+            if not disk_id:
+                return False
+
+            cursor.execute('''
+                UPDATE disk_registry 
+                SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ?
+                WHERE id = ?
+            ''', (now, disk_id))
+            conn.commit()
+            return True
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing health for {device_name}: {e}")
+            return False
+        finally:
+            # Always release the connection, including when a query raised.
+            if conn is not None:
+                conn.close()
+
    def record_disk_observation(self, device_name: str, serial: Optional[str],
                                 error_type: str, error_signature: str,
                                 raw_message: str = '',
@@ -1391,6 +1470,10 @@ class HealthPersistence:
            
            conn.commit()
            conn.close()
+            
+            # Update worst_health based on observation severity
+            self.update_disk_worst_health(clean_dev, serial, severity)
+            
        except Exception as e:
            print(f"[HealthPersistence] Error recording disk observation: {e}")

@@ -1539,186 +1622,6 @@ class HealthPersistence:
        except Exception as e:
            print(f"[HealthPersistence] Error marking removed disks: {e}")

-    # ────────────────────────────────────────────────────────────────
-    #  Disk Worst Health State Tracking
-    # ────────────────────────────────────────────────────────────────
-    
-    HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2}
-    
-    def update_disk_worst_health(self, device_name: str, serial: Optional[str],
-                                  health: str, reason: str = '') -> bool:
-        """Update worst_health if the new health is worse than current.
-        
-        Health progression is one-way: healthy -> warning -> critical
-        Only admin_clear_disk_health() can reset to healthy.
-        
-        Returns True if worst_health was updated.
-        """
-        health_lower = health.lower()
-        if health_lower not in self.HEALTH_SEVERITY_ORDER:
-            return False
-        
-        try:
-            conn = self._get_conn()
-            cursor = conn.cursor()
-            
-            disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
-            if not disk_id:
-                # Auto-register disk if not present
-                self.register_disk(device_name.replace('/dev/', ''), serial)
-                disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
-            
-            if not disk_id:
-                conn.close()
-                return False
-            
-            # Get current worst_health
-            cursor.execute('SELECT worst_health, admin_cleared FROM disk_registry WHERE id = ?', (disk_id,))
-            row = cursor.fetchone()
-            if not row:
-                conn.close()
-                return False
-            
-            current_worst = row[0] or 'healthy'
-            admin_cleared = row[1] or 0
-            
-            # If admin cleared and new issue is the same or less severe, don't update
-            # But if admin cleared and issue escalates, update anyway
-            current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0)
-            new_severity = self.HEALTH_SEVERITY_ORDER.get(health_lower, 0)
-            
-            # Only update if new health is worse
-            if new_severity > current_severity:
-                now = datetime.now().isoformat()
-                cursor.execute('''
-                    UPDATE disk_registry 
-                    SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?,
-                        admin_cleared = 0
-                    WHERE id = ?
-                ''', (health_lower, now, reason, disk_id))
-                conn.commit()
-                conn.close()
-                return True
-            
-            conn.close()
-            return False
-        except Exception as e:
-            print(f"[HealthPersistence] Error updating disk worst_health: {e}")
-            return False
-    
-    def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]:
-        """Get the worst health state for a specific disk."""
-        try:
-            conn = self._get_conn()
-            cursor = conn.cursor()
-            
-            disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
-            if not disk_id:
-                conn.close()
-                return None
-            
-            cursor.execute('''
-                SELECT worst_health, worst_health_date, worst_health_reason, 
-                       admin_cleared, admin_cleared_date, admin_cleared_note
-                FROM disk_registry WHERE id = ?
-            ''', (disk_id,))
-            row = cursor.fetchone()
-            conn.close()
-            
-            if row:
-                return {
-                    'worst_health': row[0] or 'healthy',
-                    'worst_health_date': row[1],
-                    'worst_health_reason': row[2],
-                    'admin_cleared': bool(row[3]),
-                    'admin_cleared_date': row[4],
-                    'admin_cleared_note': row[5],
-                }
-            return None
-        except Exception as e:
-            print(f"[HealthPersistence] Error getting disk worst_health: {e}")
-            return None
-    
-    def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool:
-        """Admin manually clears disk health history (e.g., after disk replacement).
-        
-        Requires a note explaining why (for audit trail).
-        """
-        if not note or len(note.strip()) < 5:
-            return False  # Require meaningful note
-        
-        try:
-            conn = self._get_conn()
-            cursor = conn.cursor()
-            
-            disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
-            if not disk_id:
-                conn.close()
-                return False
-            
-            now = datetime.now().isoformat()
-            cursor.execute('''
-                UPDATE disk_registry 
-                SET worst_health = 'healthy', admin_cleared = 1, 
-                    admin_cleared_date = ?, admin_cleared_note = ?
-                WHERE id = ?
-            ''', (now, note.strip(), disk_id))
-            
-            # Also dismiss all active observations for this disk
-            cursor.execute('''
-                UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ?
-            ''', (disk_id,))
-            
-            conn.commit()
-            conn.close()
-            return True
-        except Exception as e:
-            print(f"[HealthPersistence] Error clearing disk health: {e}")
-            return False
-    
-    def get_all_disks_health_summary(self) -> List[Dict[str, Any]]:
-        """Get health summary for all registered disks (for Health Monitor listing).
-        
-        Returns list of disks with their current and worst health states.
-        """
-        try:
-            conn = self._get_conn()
-            cursor = conn.cursor()
-            
-            cursor.execute('''
-                SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes,
-                       d.first_seen, d.last_seen, d.removed,
-                       d.worst_health, d.worst_health_date, d.worst_health_reason,
-                       d.admin_cleared, d.admin_cleared_date,
-                       (SELECT COUNT(*) FROM disk_observations o 
-                        WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations
-                FROM disk_registry d
-                WHERE d.removed = 0
-                ORDER BY d.device_name
-            ''')
-            rows = cursor.fetchall()
-            conn.close()
-            
-            return [{
-                'id': r[0],
-                'device_name': r[1],
-                'serial': r[2] or '',
-                'model': r[3] or 'Unknown',
-                'size_bytes': r[4],
-                'first_seen': r[5],
-                'last_seen': r[6],
-                'removed': bool(r[7]),
-                'worst_health': r[8] or 'healthy',
-                'worst_health_date': r[9],
-                'worst_health_reason': r[10] or '',
-                'admin_cleared': bool(r[11]),
-                'admin_cleared_date': r[12],
-                'active_observations': r[13],
-            } for r in rows]
-        except Exception as e:
-            print(f"[HealthPersistence] Error getting disks health summary: {e}")
-            return []
-

 # Global instance
 health_persistence = HealthPersistence()
@@ -402,16 +402,47 @@ class JournalWatcher:
                    entity = 'disk'
                    entity_id = f'fs_{device}'
                    
-                    # ── 24h dedup for filesystem errors per device ──
+                    # ── Get disk serial for USB-aware cooldown ──
+                    # USB disks can change device names (sda->sdb) on reconnect.
+                    # Using serial as cooldown key ensures same physical disk
+                    # shares one 24h cooldown regardless of device letter.
+                    import os as _os
+                    base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
+                    disk_serial = ''
+                    is_usb_disk = False
+                    if base_dev:
+                        try:
+                            # Check if USB via sysfs
+                            sysfs_link = subprocess.run(
+                                ['readlink', '-f', f'/sys/block/{base_dev}'],
+                                capture_output=True, text=True, timeout=2
+                            )
+                            is_usb_disk = 'usb' in sysfs_link.stdout.lower()
+                            
+                            # Get serial from smartctl
+                            smart_result = subprocess.run(
+                                ['smartctl', '-i', '-j', f'/dev/{base_dev}'],
+                                capture_output=True, text=True, timeout=5
+                            )
+                            if smart_result.returncode in (0, 4):
+                                import json
+                                smart_data = json.loads(smart_result.stdout)
+                                disk_serial = smart_data.get('serial_number', '')
+                        except Exception:
+                            pass
+                    
+                    # ── 24h dedup for filesystem errors ──
+                    # Use serial for USB disks when smartctl returned one; otherwise the device name
                    now_fs = time.time()
-                    fs_dedup_key = f'fs_{device}'
+                    if is_usb_disk and disk_serial:
+                        fs_dedup_key = f'fs_serial_{disk_serial}'
+                    else:
+                        fs_dedup_key = f'fs_{device}'
                    last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
                    if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
                        return  # Already notified for this device recently
                    
-                    # ── SMART + device existence gating ──
-                    import os as _os
-                    base_dev = re.sub(r'\d+$', '', device) if device != 'unknown' else ''
+                    # ── Device existence gating ──
                    device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
                    
                    if not device_exists and device != 'unknown':
@@ -749,7 +780,6 @@ class JournalWatcher:
        """Extract device info from a smartd system-mail and record as disk observation."""
        try:
            import re as _re
-            import subprocess
            from health_persistence import health_persistence
            
            # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
@@ -770,21 +800,6 @@ class JournalWatcher:
            if model_match:
                model = model_match.group(1).strip()
            
-            # If no serial from message, try to get it from smartctl (important for USB disks)
-            if not serial or len(serial) < 3:
-                try:
-                    result = subprocess.run(
-                        ['smartctl', '-i', '-j', f'/dev/{base_dev}'],
-                        capture_output=True, text=True, timeout=5
-                    )
-                    import json as _json
-                    data = _json.loads(result.stdout)
-                    serial = data.get('serial_number', '') or serial
-                    if not model:
-                        model = data.get('model_name', '') or data.get('model_family', '')
-                except Exception:
-                    pass
-            
            # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
            sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
            if sig_match:
@@ -821,12 +836,10 @@ class JournalWatcher:
                severity='warning',
            )
            
-            # Also update worst_health so the disk stays marked as warning
-            # even if current SMART readings show 0 pending sectors
-            warn_line_text = warn_line_m.group(1).strip() if warn_line_m else error_signature
-            health_persistence.update_disk_worst_health(
-                base_dev, serial, 'warning', warn_line_text
-            )
+            # Update worst_health for permanent tracking (record_disk_observation 
+            # already does this, but we ensure it here for safety)
+            health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
+            
        except Exception as e:
            print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")

@@ -1751,8 +1764,26 @@ class PollingCollector:
                if isinstance(details_raw, dict):
                    # Extract device name for a stable entity_id (24h cooldown key)
                    dev = details_raw.get('device', details_raw.get('disk', ''))
-                    if dev:
-                        eid = f'disk_{dev}'  # Stable per-device fingerprint
+                    serial = details_raw.get('serial', '')
+                    
+                    # For USB disks, use serial as entity_id for stable cooldown
+                    # USB disks can change device names (sda->sdb) on reconnect
+                    # Using serial ensures same physical disk shares cooldown
+                    if serial and dev:
+                        # Check if this is a USB disk
+                        try:
+                            sysfs_result = subprocess.run(
+                                ['readlink', '-f', f'/sys/block/{dev.replace("/dev/", "")}'],
+                                capture_output=True, text=True, timeout=2
+                            )
+                            if 'usb' in sysfs_result.stdout.lower():
+                                eid = f'disk_serial_{serial}'  # USB: use serial
+                            else:
+                                eid = f'disk_{dev}'  # Non-USB: use device name
+                        except Exception:
+                            eid = f'disk_{dev}'  # Fallback to device name
+                    elif dev:
+                        eid = f'disk_{dev}'  # No serial: use device name
            
            # Updates are always informational notifications except
            # system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
@@ -1818,15 +1849,26 @@ class PollingCollector:
            except Exception:
                pass
            
-            # Skip recovery notifications for SMART disk errors (pending/reallocated sectors).
-            # These indicate physical disk degradation that doesn't truly "recover" --
-            # the disk may show 0 pending sectors later but the damage history persists.
-            # The worst_health in disk_registry tracks this, so we don't send false "resolved".
+            # Skip recovery notifications for PERMANENT disk events.
+            # These indicate physical disk degradation that doesn't truly "recover":
+            # - SMART pending/reallocated sectors indicate physical damage
+            # - Disk may show 0 pending sectors later but damage history persists
+            # - Sending "Resolved" gives false sense of security
+            # The worst_health in disk_registry tracks this permanently.
            if category == 'disks':
-                reason_lower = reason.lower() if reason else ''
-                if any(indicator in reason_lower for indicator in [
-                    'pending', 'reallocated', 'sector', 'smart', 'unreadable'
-                ]):
+                reason_lower = (reason or '').lower()
+                permanent_indicators = [
+                    'pending',           # pending sectors
+                    'reallocated',       # reallocated sectors  
+                    'unreadable',        # unreadable sectors
+                    'smart',             # SMART errors
+                    'surface error',     # disk surface errors
+                    'bad sector',        # bad sectors
+                    'i/o error',         # I/O errors (repeated)
+                    'medium error',      # SCSI medium errors
+                ]
+                if any(indicator in reason_lower for indicator in permanent_indicators):
+                    # Don't send recovery - just clean up tracking
                    self._last_notified.pop(key, None)
                    continue
            
@@ -559,13 +559,6 @@ TEMPLATES = {
        'group': 'storage',
        'default_enabled': True,
    },
-    'smart_warning': {
-        'title': '{hostname}: SMART warning on {device}',
-        'body': '{device}: {reason}',
-        'label': 'SMART warning (sectors)',
-        'group': 'storage',
-        'default_enabled': True,
-    },
    'storage_unavailable': {
        'title': '{hostname}: Storage unavailable - {storage_name}',
        'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',