Update notification service

2026-05-31 12:34:48 +00:00 · 2026-03-08 18:15:36 +01:00
parent 8c51957bfa
commit 1ea28d66df
7 changed files with 611 additions and 31 deletions
@@ -30,6 +30,7 @@ import {
  ChevronRight,
  Settings2,
  HelpCircle,
+  Usb,
 } from "lucide-react"

 interface CategoryCheck {
@@ -414,13 +415,44 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
  ) => {
    if (!checks || Object.keys(checks).length === 0) return null

+    // Sort checks: non-disk entries first, then disk entries sorted by device name
+    const sortedEntries = Object.entries(checks)
+      .filter(([, checkData]) => checkData.installed !== false)
+      .sort(([keyA, dataA], [keyB, dataB]) => {
+        const isDiskA = dataA.is_disk_entry === true
+        const isDiskB = dataB.is_disk_entry === true
+        if (isDiskA && !isDiskB) return 1
+        if (!isDiskA && isDiskB) return -1
+        if (isDiskA && isDiskB) {
+          // Sort disks by device name
+          const deviceA = dataA.device || keyA
+          const deviceB = dataB.device || keyB
+          return deviceA.localeCompare(deviceB)
+        }
+        return 0
+      })
+
    return (
      <div className="mt-2 space-y-0.5">
-        {Object.entries(checks)
-          .filter(([, checkData]) => checkData.installed !== false)
-          .map(([checkKey, checkData]) => {
+        {sortedEntries.map(([checkKey, checkData]) => {
          const isDismissable = checkData.dismissable === true
          const checkStatus = checkData.status?.toUpperCase() || "OK"
+          const isDiskEntry = checkData.is_disk_entry === true
+
+          // For disk entries, format label specially
+          let displayLabel = formatCheckLabel(checkKey)
+          let diskIcon = null
+          if (isDiskEntry) {
+            displayLabel = checkData.device || checkKey.replace(/_/g, '/')
+            const diskType = checkData.disk_type || ''
+            if (diskType === 'USB') {
+              diskIcon = <Usb className="h-3 w-3 text-orange-400 mr-1" />
+            } else if (diskType === 'NVMe') {
+              diskIcon = <HardDrive className="h-3 w-3 text-blue-400 mr-1" />
+            } else {
+              diskIcon = <HardDrive className="h-3 w-3 text-muted-foreground mr-1" />
+            }
+          }

          return (
            <div
@@ -429,7 +461,15 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
            >
              <div className="flex items-start gap-1.5 sm:gap-2 min-w-0 flex-1">
                <span className="mt-0.5 shrink-0">{getStatusIcon(checkData.dismissed ? "INFO" : checkData.status, "sm")}</span>
-                <span className="font-medium shrink-0">{formatCheckLabel(checkKey)}</span>
+                <span className="font-medium shrink-0 flex items-center">
+                  {diskIcon}
+                  {displayLabel}
+                  {isDiskEntry && checkData.disk_type && (
+                    <Badge variant="outline" className="ml-1.5 text-[8px] px-1 py-0 h-3.5 shrink-0">
+                      {checkData.disk_type}
+                    </Badge>
+                  )}
+                </span>
                <span className="text-muted-foreground break-words whitespace-pre-wrap min-w-0">{checkData.detail}</span>
                {checkData.dismissed && (
                  <Badge variant="outline" className="text-[9px] px-1 py-0 h-4 shrink-0 text-blue-400 border-blue-400/30">
@@ -459,6 +499,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
                    )}
                  </Button>
                )}
+
              </div>
            </div>
          )
@@ -169,8 +169,8 @@ const generateLatencyReport = (report: ReportData) => {
    endTime: new Date(report.data[report.data.length - 1].timestamp * 1000).toLocaleString(),
  } : null

-  // Build history table rows for gateway mode (last 24 records)
-  const historyTableRows = report.data.slice(-24).map((d, i) => `
+  // Build history table rows for gateway mode (last 20 records)
+  const historyTableRows = report.data.slice(-20).map((d, i) => `
    <tr${d.packet_loss && d.packet_loss > 0 ? ' class="warn"' : ''}>
      <td>${i + 1}</td>
      <td>${new Date(d.timestamp * 1000).toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}</td>
@@ -614,7 +614,7 @@ const generateLatencyReport = (report: ReportData) => {
 ${!report.isRealtime && report.data.length > 0 ? `
 <!-- 5. Detailed History (for Gateway) -->
 <div class="section">
-  <div class="section-title">5. Latency History (Last ${Math.min(24, report.data.length)} Records)</div>
+  <div class="section-title">5. Latency History (Last ${Math.min(20, report.data.length)} Records)</div>
  <table class="chk-tbl">
  <thead>
  <tr>
@@ -1016,34 +1016,59 @@ export function StorageOverview() {
                    className="sm:hidden border border-white/10 rounded-lg p-4 cursor-pointer bg-white/5 transition-colors"
                    onClick={() => handleDiskClick(disk)}
                  >
-                    <div className="space-y-2 mb-3">
-                      <div className="flex items-center gap-2">
-                        <Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
-                        <h3 className="font-semibold">/dev/{disk.name}</h3>
-                        <Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
-                      </div>
-                      <div className="flex items-center justify-between gap-3 pl-7">
-                        {disk.model && disk.model !== "Unknown" && (
-                          <p className="text-sm text-muted-foreground truncate flex-1 min-w-0">{disk.model}</p>
-                        )}
-                        <div className="flex items-center gap-3 flex-shrink-0">
+                    <div className="space-y-3">
+                      {/* Header row */}
+                      <div className="flex items-center justify-between">
+                        <div className="flex items-center gap-2">
+                          <Usb className="h-5 w-5 text-orange-400 flex-shrink-0" />
+                          <h3 className="font-semibold">/dev/{disk.name}</h3>
+                          <Badge className="bg-orange-500/10 text-orange-400 border-orange-500/20 text-[10px] px-1.5">USB</Badge>
+                        </div>
+                        <div className="flex items-center gap-2">
                          {disk.temperature > 0 && (
                            <div className="flex items-center gap-1">
-                              <Thermometer className={`h-4 w-4 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
-                              <span className={`text-sm font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
+                              <Thermometer className={`h-3.5 w-3.5 ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`} />
+                              <span className={`text-xs font-medium ${getTempColor(disk.temperature, disk.name, disk.rotation_rate)}`}>
                                {disk.temperature}°C
                              </span>
                            </div>
                          )}
                          {getHealthBadge(disk.health)}
-                          {(disk.observations_count ?? 0) > 0 && (
-                            <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
-                              <Info className="h-3 w-3" />
-                              {disk.observations_count}
-                            </Badge>
-                          )}
                        </div>
                      </div>
+                      
+                      {/* Model if available */}
+                      {disk.model && disk.model !== "Unknown" && (
+                        <p className="text-sm text-muted-foreground truncate pl-7">{disk.model}</p>
+                      )}
+                      
+                      {/* Info grid - 2 columns */}
+                      <div className="grid grid-cols-2 gap-x-4 gap-y-2 pl-7 text-sm">
+                        <div>
+                          <span className="text-muted-foreground">Size</span>
+                          <p className="font-medium">{disk.size || "N/A"}</p>
+                        </div>
+                        <div>
+                          <span className="text-muted-foreground">SMART Status</span>
+                          <p className="font-medium">{disk.smart_status || "N/A"}</p>
+                        </div>
+                        {disk.serial && disk.serial !== "Unknown" && (
+                          <div className="col-span-2">
+                            <span className="text-muted-foreground">Serial</span>
+                            <p className="font-medium text-xs truncate">{disk.serial}</p>
+                          </div>
+                        )}
+                      </div>
+                      
+                      {/* Observations badge, rendered only when the count is positive; the count is treated as 0 when absent */}
+                      {(disk.observations_count ?? 0) > 0 && (
+                        <div className="pl-7">
+                          <Badge className="bg-blue-500/10 text-blue-400 border-blue-500/20 gap-1 text-[10px] px-1.5 py-0">
+                            <Info className="h-3 w-3" />
+                            {disk.observations_count} observation{(disk.observations_count ?? 0) > 1 ? 's' : ''}
+                          </Badge>
+                        </div>
+                      )}
                    </div>
                  </div>

@@ -2554,6 +2554,55 @@ def get_smart_data(disk_name):
        import traceback
        traceback.print_exc()
    
+    # ── Integrate persistent worst_health ──
+    # The health should never improve from a previous worst state without admin intervention.
+    # This prevents disks from showing "healthy" after they had issues that may have auto-resolved.
+    try:
+        current_health = smart_data['health']
+        serial = smart_data.get('serial', '')
+        
+        # Get persistent worst_health
+        worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None)
+        
+        if worst_info:
+            worst_health = worst_info.get('worst_health', 'healthy')
+            admin_cleared = worst_info.get('admin_cleared', False)
+            
+            # Only apply worst_health if not cleared by admin
+            if not admin_cleared:
+                severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2}
+                current_severity = severity_order.get(current_health, 0)
+                worst_severity = severity_order.get(worst_health, 0)
+                
+                # If worst_health is worse than current, use worst_health
+                if worst_severity > current_severity:
+                    smart_data['health'] = worst_health
+                    smart_data['health_source'] = 'persistent'
+                    smart_data['worst_health_date'] = worst_info.get('worst_health_date')
+                    smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '')
+        
+        # Update worst_health if current is worse (and not already stored)
+        if current_health in ('warning', 'critical'):
+            health_reason = ''
+            if smart_data.get('pending_sectors', 0) > 0:
+                health_reason = f"{smart_data['pending_sectors']} pending sector(s)"
+            if smart_data.get('reallocated_sectors', 0) > 0:
+                if health_reason:
+                    health_reason += f", {smart_data['reallocated_sectors']} reallocated"
+                else:
+                    health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)"
+            if smart_data.get('smart_status') == 'failed':
+                health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
+            
+            health_persistence.update_disk_worst_health(
+                disk_name, 
+                serial if serial != 'Unknown' else None,
+                current_health,
+                health_reason
+            )
+    except Exception as e:
+        # print(f"[v0] Error integrating worst_health: {e}")
+        pass

    return smart_data

@@ -1105,8 +1105,55 @@ class HealthMonitor:
        if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
            checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
        
+        # Get physical disks list for UI display
+        physical_disks = self._get_physical_disks_list()
+        
+        # Add individual disk checks for UI display (like Network interfaces)
+        for disk in physical_disks:
+            device = disk.get('device', '')
+            name = disk.get('name', '')
+            serial = disk.get('serial', '')
+            final_health = disk.get('final_health', 'healthy')
+            final_reason = disk.get('final_reason', '')
+            is_usb = disk.get('is_usb', False)
+            
+            # Format check key - use device path for uniqueness
+            check_key = device.lower().replace('/', '_')  # e.g., _dev_sda
+            
+            # Determine status
+            if final_health == 'critical':
+                status = 'CRITICAL'
+            elif final_health == 'warning':
+                status = 'WARNING'
+            else:
+                status = 'OK'
+            
+            # Build detail string
+            disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA')
+            detail = f'{serial}' if serial else 'Unknown serial'
+            if final_reason:
+                detail += f' - {final_reason}'
+            
+            # Only add to checks if not already present (avoid duplicating error entries)
+            if check_key not in checks:
+                checks[check_key] = {
+                    'status': status,
+                    'detail': detail,
+                    'device': device,
+                    'serial': serial,
+                    'disk_type': disk_type,
+                    'is_disk_entry': True,  # Flag to identify disk entries in frontend
+                    'worst_health': disk.get('worst_health', 'healthy'),
+                    'worst_health_date': disk.get('worst_health_date'),
+                    'admin_cleared': disk.get('admin_cleared', False),
+                }
+                
+                # If disk has issues, it needs an error_key for dismiss functionality
+                if status != 'OK':
+                    checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}'
+        
        if not issues:
-            return {'status': 'OK', 'checks': checks}
+            return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks}
        
        # ── Mark dismissed checks ──
        # If an error_key in a check has been acknowledged (dismissed) in the
@@ -1138,6 +1185,7 @@ class HealthMonitor:
                    'reason': '; '.join(issues[:3]),
                    'details': storage_details,
                    'checks': checks,
+                    'physical_disks': physical_disks,
                    'all_dismissed': True,
                }
        except Exception:
@@ -1152,7 +1200,8 @@ class HealthMonitor:
            'status': 'CRITICAL' if has_critical else 'WARNING',
            'reason': '; '.join(issues[:3]),
            'details': storage_details,
-            'checks': checks
+            'checks': checks,
+            'physical_disks': physical_disks
        }
    
    def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
@@ -1235,10 +1284,222 @@ class HealthMonitor:
                else:
                    return {'status': 'OK'} # No VGs found, LVM not in use
            
-            return {'status': 'OK', 'volumes': len(volumes)}
+        return {'status': 'OK', 'volumes': len(volumes)}
+    
+    except Exception:
+        return {'status': 'OK'}
+
+    def _get_physical_disks_list(self) -> List[Dict[str, Any]]:
+        """Get list of all physical disks with their health status.
+        
+        Combines real-time SMART data with persistent worst_health state.
+        Returns list suitable for display in Health Monitor UI.
+        """
+        disks = []
+        
+        try:
+            # Get all block devices
+            result = subprocess.run(
+                ['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'],
+                capture_output=True, text=True, timeout=5
+            )
            
-        except Exception:
-            return {'status': 'OK'}
+            if result.returncode != 0:
+                return []
+            
+            for line in result.stdout.strip().split('\n'):
+                if not line.strip():
+                    continue
+                
+                parts = line.split(None, 5)
+                if len(parts) < 3:
+                    continue
+                
+                name = parts[0]
+                size = parts[1] if len(parts) > 1 else ''
+                dtype = parts[2] if len(parts) > 2 else ''
+                transport = parts[3] if len(parts) > 3 else ''
+                model = parts[4] if len(parts) > 4 else ''
+                serial = parts[5] if len(parts) > 5 else ''
+                
+                # Only include disk type devices
+                if dtype != 'disk':
+                    continue
+                
+                # Skip loop devices, ram disks, etc.
+                if name.startswith(('loop', 'ram', 'zram')):
+                    continue
+                
+                is_usb = transport.lower() == 'usb'
+                is_nvme = name.startswith('nvme')
+                
+                # Get current SMART status
+                current_health = 'healthy'
+                smart_status = 'UNKNOWN'
+                pending_sectors = 0
+                reallocated_sectors = 0
+                
+                try:
+                    dev_path = f'/dev/{name}'
+                    smart_result = subprocess.run(
+                        ['smartctl', '-H', '-A', dev_path],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    
+                    output = smart_result.stdout
+                    
+                    # Check SMART overall status
+                    if 'PASSED' in output:
+                        smart_status = 'PASSED'
+                    elif 'FAILED' in output:
+                        smart_status = 'FAILED'
+                        current_health = 'critical'
+                    
+                    # Parse SMART attributes for pending/reallocated sectors
+                    for attr_line in output.split('\n'):
+                        if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line:
+                            parts_attr = attr_line.split()
+                            if parts_attr:
+                                try:
+                                    pending_sectors = int(parts_attr[-1])
+                                except ValueError:
+                                    pass
+                        elif 'Reallocated_Sector' in attr_line:
+                            parts_attr = attr_line.split()
+                            if parts_attr:
+                                try:
+                                    reallocated_sectors = int(parts_attr[-1])
+                                except ValueError:
+                                    pass
+                    
+                    # Determine current health based on sectors
+                    if current_health != 'critical':
+                        if pending_sectors > 10 or reallocated_sectors > 10:
+                            current_health = 'critical'
+                        elif pending_sectors > 0 or reallocated_sectors > 0:
+                            current_health = 'warning'
+                
+                except Exception:
+                    pass
+                
+                # Build health reason
+                health_reason = ''
+                if pending_sectors > 0:
+                    health_reason = f'{pending_sectors} pending sector(s)'
+                if reallocated_sectors > 0:
+                    if health_reason:
+                        health_reason += f', {reallocated_sectors} reallocated'
+                    else:
+                        health_reason = f'{reallocated_sectors} reallocated sector(s)'
+                if smart_status == 'FAILED':
+                    health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
+                
+                # Get persistent worst_health from database
+                worst_info = health_persistence.get_disk_worst_health(name, serial)
+                worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy'
+                worst_health_date = worst_info.get('worst_health_date') if worst_info else None
+                worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else ''
+                admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False
+                
+                # Update worst_health if current is worse
+                if current_health != 'healthy':
+                    updated = health_persistence.update_disk_worst_health(
+                        name, serial, current_health, health_reason
+                    )
+                    if updated:
+                        worst_health = current_health
+                        worst_health_reason = health_reason
+                    
+                    # Record as disk observation (for both internal and USB disks)
+                    # This ensures SMART issues are tracked in observations
+                    try:
+                        obs_type = 'smart_error'
+                        if pending_sectors and pending_sectors > 0:
+                            obs_type = 'pending_sectors'
+                        elif reallocated_sectors and reallocated_sectors > 0:
+                            obs_type = 'reallocated_sectors'
+                        elif smart_status == 'FAILED':
+                            obs_type = 'smart_failed'
+                        
+                        obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}'
+                        health_persistence.record_disk_observation(
+                            device_name=name,
+                            serial=serial,
+                            error_type=obs_type,
+                            error_signature=obs_sig,
+                            raw_message=f'/dev/{name}: {health_reason}',
+                            severity=current_health,
+                        )
+                        
+                        # Send smart_warning notification if this is a NEW issue
+                        # (only when updated=True means this is first time seeing this state)
+                        if updated:
+                            try:
+                                from notification_manager import notification_manager
+                                notification_manager.send_notification(
+                                    event_type='smart_warning',
+                                    data={
+                                        'device': f'/dev/{name}',
+                                        'reason': health_reason,
+                                        'serial': serial or 'Unknown',
+                                        'model': model or 'Unknown',
+                                        'pending_sectors': pending_sectors,
+                                        'reallocated_sectors': reallocated_sectors,
+                                        'smart_status': smart_status,
+                                        'hostname': self._hostname,
+                                    }
+                                )
+                            except Exception:
+                                pass
+                    except Exception:
+                        pass
+                
+                # Final health is the worse of current and persistent
+                severity_order = {'healthy': 0, 'warning': 1, 'critical': 2}
+                if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0):
+                    final_health = worst_health
+                    final_reason = worst_health_reason
+                else:
+                    final_health = current_health
+                    final_reason = health_reason
+                
+                # Get active observations count
+                obs = health_persistence.get_disk_observations(device_name=name, serial=serial)
+                active_observations = len(obs) if obs else 0
+                
+                # Register disk in persistence (for tracking)
+                try:
+                    health_persistence.register_disk(name, serial, model)
+                except Exception:
+                    pass
+                
+                disks.append({
+                    'device': f'/dev/{name}',
+                    'name': name,
+                    'serial': serial or '',
+                    'model': model or 'Unknown',
+                    'size': size,
+                    'transport': transport,
+                    'is_usb': is_usb,
+                    'is_nvme': is_nvme,
+                    'smart_status': smart_status,
+                    'current_health': current_health,
+                    'current_health_reason': health_reason,
+                    'worst_health': worst_health,
+                    'worst_health_date': worst_health_date,
+                    'worst_health_reason': worst_health_reason,
+                    'final_health': final_health,
+                    'final_reason': final_reason,
+                    'pending_sectors': pending_sectors,
+                    'reallocated_sectors': reallocated_sectors,
+                    'active_observations': active_observations,
+                    'admin_cleared': admin_cleared,
+                })
+        
+        except Exception as e:
+            print(f"[HealthMonitor] Error getting physical disks list: {e}")
+        
+        return disks
    
    # This function is no longer used in get_detailed_status, but kept for reference if needed.
    # The new _check_proxmox_storage function handles this logic better.
@@ -162,6 +162,12 @@ class HealthPersistence:
                first_seen TEXT NOT NULL,
                last_seen TEXT NOT NULL,
                removed INTEGER DEFAULT 0,
+                worst_health TEXT DEFAULT 'healthy',
+                worst_health_date TEXT,
+                worst_health_reason TEXT,
+                admin_cleared INTEGER DEFAULT 0,
+                admin_cleared_date TEXT,
+                admin_cleared_note TEXT,
                UNIQUE(device_name, serial)
            )
        ''')
@@ -189,6 +195,17 @@ class HealthPersistence:
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
        
+        # Migration: add worst_health columns to disk_registry if not present
+        cursor.execute("PRAGMA table_info(disk_registry)")
+        disk_columns = [col[1] for col in cursor.fetchall()]
+        if 'worst_health' not in disk_columns:
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'")
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT")
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT")
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0")
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT")
+            cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT")
+        
        conn.commit()
        conn.close()
    
@@ -1476,6 +1493,186 @@ class HealthPersistence:
        except Exception as e:
            print(f"[HealthPersistence] Error marking removed disks: {e}")

+    # ────────────────────────────────────────────────────────────────
+    #  Disk Worst Health State Tracking
+    # ────────────────────────────────────────────────────────────────
+    
+    # Severity rank for each recognised health state; a larger number is worse.
+    # update_disk_worst_health() compares these ranks to decide whether to escalate.
+    HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2}
+    
+    def update_disk_worst_health(self, device_name: str, serial: Optional[str],
+                                  health: str, reason: str = '') -> bool:
+        """Record a new worst-health state for a disk if it is more severe.
+
+        Severity only moves one way (healthy -> warning -> critical, ranked by
+        HEALTH_SEVERITY_ORDER). Only admin_clear_disk_health() resets the
+        stored state back to healthy.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None when unknown.
+            health: New health state, matched case-insensitively against
+                HEALTH_SEVERITY_ORDER. Unknown or non-string values are ignored.
+            reason: Free-text explanation stored alongside the new state.
+
+        Returns:
+            True if worst_health was updated. False if the new state is not
+            worse than the stored one, the input is invalid, the disk cannot
+            be resolved, or a database error occurred (logged, not raised).
+        """
+        if not isinstance(health, str):
+            return False
+        health_lower = health.lower()
+        if health_lower not in self.HEALTH_SEVERITY_ORDER:
+            return False
+
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            # Lookups and registration both use the name with '/dev/' stripped.
+            name = device_name.replace('/dev/', '')
+            disk_id = self._get_disk_registry_id(cursor, name, serial)
+            if not disk_id:
+                # Auto-register disk if not present, then look it up again
+                self.register_disk(name, serial)
+                disk_id = self._get_disk_registry_id(cursor, name, serial)
+            if not disk_id:
+                return False
+
+            cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
+            row = cursor.fetchone()
+            if not row:
+                return False
+
+            # NULL or unrecognised stored values are treated as 'healthy' (rank 0).
+            current_worst = row[0] or 'healthy'
+            current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0)
+            new_severity = self.HEALTH_SEVERITY_ORDER[health_lower]
+
+            # Only escalate; equal or milder states leave the record untouched.
+            if new_severity <= current_severity:
+                return False
+
+            now = datetime.now().isoformat()
+            # An escalation also drops the admin_cleared flag.
+            cursor.execute('''
+                UPDATE disk_registry
+                SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?,
+                    admin_cleared = 0
+                WHERE id = ?
+            ''', (health_lower, now, reason, disk_id))
+            conn.commit()
+            return True
+        except Exception as e:
+            print(f"[HealthPersistence] Error updating disk worst_health: {e}")
+            return False
+        finally:
+            # Close on every path, including exceptions, so connections are not leaked.
+            if conn is not None:
+                conn.close()
+    
+    
+    def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """Get the worst health state recorded for a specific disk.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None when unknown.
+
+        Returns:
+            A dict with keys worst_health, worst_health_date,
+            worst_health_reason, admin_cleared, admin_cleared_date and
+            admin_cleared_note, or None if the disk is not in the registry or
+            a database error occurred (logged, not raised).
+        """
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+            if not disk_id:
+                return None
+
+            cursor.execute('''
+                SELECT worst_health, worst_health_date, worst_health_reason,
+                       admin_cleared, admin_cleared_date, admin_cleared_note
+                FROM disk_registry WHERE id = ?
+            ''', (disk_id,))
+            row = cursor.fetchone()
+            if not row:
+                return None
+
+            return {
+                # A NULL worst_health is reported as 'healthy'.
+                'worst_health': row[0] or 'healthy',
+                'worst_health_date': row[1],
+                'worst_health_reason': row[2],
+                'admin_cleared': bool(row[3]),
+                'admin_cleared_date': row[4],
+                'admin_cleared_note': row[5],
+            }
+        except Exception as e:
+            print(f"[HealthPersistence] Error getting disk worst_health: {e}")
+            return None
+        finally:
+            # Close on every path, including exceptions, so connections are not leaked.
+            if conn is not None:
+                conn.close()
+    
+    
+    def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool:
+        """Admin manually clears disk health history (e.g., after disk replacement).
+
+        Resets worst_health to 'healthy', sets the admin_cleared flag with a
+        timestamp and the given note, and marks every observation of the disk
+        as dismissed. Both updates are committed together.
+
+        Args:
+            device_name: Device name, with or without the '/dev/' prefix.
+            serial: Disk serial number, or None when unknown.
+            note: Explanation for the audit trail; must contain at least 5
+                non-whitespace-trimmed characters.
+
+        Returns:
+            True if the disk was cleared. False if the note is missing or too
+            short, the disk is not in the registry, or a database error
+            occurred (logged, not raised).
+        """
+        # Require a meaningful note; also rejects None and non-string values.
+        if not isinstance(note, str) or len(note.strip()) < 5:
+            return False
+
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+            if not disk_id:
+                return False
+
+            now = datetime.now().isoformat()
+            cursor.execute('''
+                UPDATE disk_registry
+                SET worst_health = 'healthy', admin_cleared = 1,
+                    admin_cleared_date = ?, admin_cleared_note = ?
+                WHERE id = ?
+            ''', (now, note.strip(), disk_id))
+
+            # Also dismiss all observations for this disk
+            cursor.execute('''
+                UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ?
+            ''', (disk_id,))
+
+            conn.commit()
+            return True
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing disk health: {e}")
+            return False
+        finally:
+            # Close on every path; closing before commit discards partial updates.
+            if conn is not None:
+                conn.close()
+    
+    
+    def get_all_disks_health_summary(self) -> List[Dict[str, Any]]:
+        """Get health summary for all registered disks (for Health Monitor listing).
+
+        Only disks with removed = 0 are included, ordered by device_name.
+
+        Returns:
+            One dict per disk with its registry fields, its worst recorded
+            health state, the admin-clear flag and date, and
+            'active_observations' (count of non-dismissed observations).
+            Returns an empty list if a database error occurred (logged, not
+            raised).
+        """
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            cursor.execute('''
+                SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes,
+                       d.first_seen, d.last_seen, d.removed,
+                       d.worst_health, d.worst_health_date, d.worst_health_reason,
+                       d.admin_cleared, d.admin_cleared_date,
+                       (SELECT COUNT(*) FROM disk_observations o
+                        WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations
+                FROM disk_registry d
+                WHERE d.removed = 0
+                ORDER BY d.device_name
+            ''')
+            rows = cursor.fetchall()
+
+            # Map positional columns to named keys; NULL text fields get display defaults.
+            return [{
+                'id': r[0],
+                'device_name': r[1],
+                'serial': r[2] or '',
+                'model': r[3] or 'Unknown',
+                'size_bytes': r[4],
+                'first_seen': r[5],
+                'last_seen': r[6],
+                'removed': bool(r[7]),
+                'worst_health': r[8] or 'healthy',
+                'worst_health_date': r[9],
+                'worst_health_reason': r[10] or '',
+                'admin_cleared': bool(r[11]),
+                'admin_cleared_date': r[12],
+                'active_observations': r[13],
+            } for r in rows]
+        except Exception as e:
+            print(f"[HealthPersistence] Error getting disks health summary: {e}")
+            return []
+        finally:
+            # Close on every path, including exceptions, so connections are not leaked.
+            if conn is not None:
+                conn.close()
+
+

 # Global instance
 health_persistence = HealthPersistence()
@@ -559,6 +559,13 @@ TEMPLATES = {
        'group': 'storage',
        'default_enabled': True,
    },
+    'smart_warning': {
+        'title': '{hostname}: SMART warning on {device}',
+        'body': '{device}: {reason}',
+        'label': 'SMART warning (sectors)',
+        'group': 'storage',
+        'default_enabled': True,
+    },
    'storage_unavailable': {
        'title': '{hostname}: Storage unavailable - {storage_name}',
        'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',