- {disk.model && disk.model !== "Unknown" && (
-
{disk.model}
- )}
-
+
+ {/* Header row */}
+
+
+
+
/dev/{disk.name}
+ USB
+
+
{disk.temperature > 0 && (
-
-
+
+
{disk.temperature}°C
)}
{getHealthBadge(disk.health)}
- {(disk.observations_count ?? 0) > 0 && (
-
-
- {disk.observations_count}
-
- )}
+
+ {/* Model if available */}
+ {disk.model && disk.model !== "Unknown" && (
+
{disk.model}
+ )}
+
+ {/* Info grid - 2 columns */}
+
+
+
Size
+
{disk.size || "N/A"}
+
+
+
SMART Status
+
{disk.smart_status || "N/A"}
+
+ {disk.serial && disk.serial !== "Unknown" && (
+
+
Serial
+
{disk.serial}
+
+ )}
+
+
+ {/* Observations badge if any */}
+ {(disk.observations_count ?? 0) > 0 && (
+
+
+
+ {disk.observations_count} observation{disk.observations_count > 1 ? 's' : ''}
+
+
+ )}
diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py
index 90579fe4..a144e60c 100644
--- a/AppImage/scripts/flask_server.py
+++ b/AppImage/scripts/flask_server.py
@@ -2554,6 +2554,55 @@ def get_smart_data(disk_name):
import traceback
traceback.print_exc()
+ # ── Integrate persistent worst_health ──
+ # The health should never improve from a previous worst state without admin intervention.
+ # This prevents disks from showing "healthy" after they had issues that may have auto-resolved.
+ try:
+ current_health = smart_data['health']
+ serial = smart_data.get('serial', '')
+
+ # Get persistent worst_health
+ worst_info = health_persistence.get_disk_worst_health(disk_name, serial if serial != 'Unknown' else None)
+
+ if worst_info:
+ worst_health = worst_info.get('worst_health', 'healthy')
+ admin_cleared = worst_info.get('admin_cleared', False)
+
+ # Only apply worst_health if not cleared by admin
+ if not admin_cleared:
+ severity_order = {'unknown': -1, 'healthy': 0, 'warning': 1, 'critical': 2}
+ current_severity = severity_order.get(current_health, 0)
+ worst_severity = severity_order.get(worst_health, 0)
+
+ # If worst_health is worse than current, use worst_health
+ if worst_severity > current_severity:
+ smart_data['health'] = worst_health
+ smart_data['health_source'] = 'persistent'
+ smart_data['worst_health_date'] = worst_info.get('worst_health_date')
+ smart_data['worst_health_reason'] = worst_info.get('worst_health_reason', '')
+
+ # Update worst_health if current is worse (and not already stored)
+ if current_health in ('warning', 'critical'):
+ health_reason = ''
+ if smart_data.get('pending_sectors', 0) > 0:
+ health_reason = f"{smart_data['pending_sectors']} pending sector(s)"
+ if smart_data.get('reallocated_sectors', 0) > 0:
+ if health_reason:
+ health_reason += f", {smart_data['reallocated_sectors']} reallocated"
+ else:
+ health_reason = f"{smart_data['reallocated_sectors']} reallocated sector(s)"
+ if smart_data.get('smart_status') == 'failed':
+ health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
+
+ health_persistence.update_disk_worst_health(
+ disk_name,
+ serial if serial != 'Unknown' else None,
+ current_health,
+ health_reason
+ )
+ except Exception as e:
+ # print(f"[v0] Error integrating worst_health: {e}")
+ pass
return smart_data
diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index 84167191..d3bbca6b 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -1105,8 +1105,55 @@ class HealthMonitor:
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
checks['lvm_volumes'] = {'status': 'OK', 'detail': 'LVM volumes OK'}
+ # Get physical disks list for UI display
+ physical_disks = self._get_physical_disks_list()
+
+ # Add individual disk checks for UI display (like Network interfaces)
+ for disk in physical_disks:
+ device = disk.get('device', '')
+ name = disk.get('name', '')
+ serial = disk.get('serial', '')
+ final_health = disk.get('final_health', 'healthy')
+ final_reason = disk.get('final_reason', '')
+ is_usb = disk.get('is_usb', False)
+
+ # Format check key - use device path for uniqueness
+ check_key = device.lower().replace('/', '_') # e.g., _dev_sda
+
+ # Determine status
+ if final_health == 'critical':
+ status = 'CRITICAL'
+ elif final_health == 'warning':
+ status = 'WARNING'
+ else:
+ status = 'OK'
+
+ # Build detail string
+ disk_type = 'USB' if is_usb else ('NVMe' if disk.get('is_nvme') else 'SATA')
+ detail = f'{serial}' if serial else 'Unknown serial'
+ if final_reason:
+ detail += f' - {final_reason}'
+
+ # Only add to checks if not already present (avoid duplicating error entries)
+ if check_key not in checks:
+ checks[check_key] = {
+ 'status': status,
+ 'detail': detail,
+ 'device': device,
+ 'serial': serial,
+ 'disk_type': disk_type,
+ 'is_disk_entry': True, # Flag to identify disk entries in frontend
+ 'worst_health': disk.get('worst_health', 'healthy'),
+ 'worst_health_date': disk.get('worst_health_date'),
+ 'admin_cleared': disk.get('admin_cleared', False),
+ }
+
+ # If disk has issues, it needs an error_key for dismiss functionality
+ if status != 'OK':
+ checks[check_key]['error_key'] = f'disk_{name}_{serial}' if serial else f'disk_{name}'
+
if not issues:
- return {'status': 'OK', 'checks': checks}
+ return {'status': 'OK', 'checks': checks, 'physical_disks': physical_disks}
# ── Mark dismissed checks ──
# If an error_key in a check has been acknowledged (dismissed) in the
@@ -1138,6 +1185,7 @@ class HealthMonitor:
'reason': '; '.join(issues[:3]),
'details': storage_details,
'checks': checks,
+ 'physical_disks': physical_disks,
'all_dismissed': True,
}
except Exception:
@@ -1152,7 +1200,8 @@ class HealthMonitor:
'status': 'CRITICAL' if has_critical else 'WARNING',
'reason': '; '.join(issues[:3]),
'details': storage_details,
- 'checks': checks
+ 'checks': checks,
+ 'physical_disks': physical_disks
}
def _check_filesystem(self, mount_point: str) -> Dict[str, Any]:
@@ -1235,10 +1284,222 @@ class HealthMonitor:
else:
return {'status': 'OK'} # No VGs found, LVM not in use
- return {'status': 'OK', 'volumes': len(volumes)}
+ return {'status': 'OK', 'volumes': len(volumes)}
+
+ except Exception:
+ return {'status': 'OK'}
+
+ def _get_physical_disks_list(self) -> List[Dict[str, Any]]:
+ """Get list of all physical disks with their health status.
+
+ Combines real-time SMART data with persistent worst_health state.
+ Returns list suitable for display in Health Monitor UI.
+ """
+ disks = []
+
+ try:
+ # Get all block devices
+ result = subprocess.run(
+ ['lsblk', '-d', '-n', '-o', 'NAME,SIZE,TYPE,TRAN,MODEL,SERIAL'],
+ capture_output=True, text=True, timeout=5
+ )
- except Exception:
- return {'status': 'OK'}
+ if result.returncode != 0:
+ return []
+
+ for line in result.stdout.strip().split('\n'):
+ if not line.strip():
+ continue
+
+ parts = line.split(None, 5)
+ if len(parts) < 3:
+ continue
+
+ name = parts[0]
+ size = parts[1] if len(parts) > 1 else ''
+ dtype = parts[2] if len(parts) > 2 else ''
+ transport = parts[3] if len(parts) > 3 else ''
+ model = parts[4] if len(parts) > 4 else ''
+ serial = parts[5] if len(parts) > 5 else ''
+
+ # Only include disk type devices
+ if dtype != 'disk':
+ continue
+
+ # Skip loop devices, ram disks, etc.
+ if name.startswith(('loop', 'ram', 'zram')):
+ continue
+
+ is_usb = transport.lower() == 'usb'
+ is_nvme = name.startswith('nvme')
+
+ # Get current SMART status
+ current_health = 'healthy'
+ smart_status = 'UNKNOWN'
+ pending_sectors = 0
+ reallocated_sectors = 0
+
+ try:
+ dev_path = f'/dev/{name}'
+ smart_result = subprocess.run(
+ ['smartctl', '-H', '-A', dev_path],
+ capture_output=True, text=True, timeout=5
+ )
+
+ output = smart_result.stdout
+
+ # Check SMART overall status
+ if 'PASSED' in output:
+ smart_status = 'PASSED'
+ elif 'FAILED' in output:
+ smart_status = 'FAILED'
+ current_health = 'critical'
+
+ # Parse SMART attributes for pending/reallocated sectors
+ for attr_line in output.split('\n'):
+ if 'Current_Pending_Sector' in attr_line or 'Pending_Sector' in attr_line:
+ parts_attr = attr_line.split()
+ if parts_attr:
+ try:
+ pending_sectors = int(parts_attr[-1])
+ except ValueError:
+ pass
+ elif 'Reallocated_Sector' in attr_line:
+ parts_attr = attr_line.split()
+ if parts_attr:
+ try:
+ reallocated_sectors = int(parts_attr[-1])
+ except ValueError:
+ pass
+
+ # Determine current health based on sectors
+ if current_health != 'critical':
+ if pending_sectors > 10 or reallocated_sectors > 10:
+ current_health = 'critical'
+ elif pending_sectors > 0 or reallocated_sectors > 0:
+ current_health = 'warning'
+
+ except Exception:
+ pass
+
+ # Build health reason
+ health_reason = ''
+ if pending_sectors > 0:
+ health_reason = f'{pending_sectors} pending sector(s)'
+ if reallocated_sectors > 0:
+ if health_reason:
+ health_reason += f', {reallocated_sectors} reallocated'
+ else:
+ health_reason = f'{reallocated_sectors} reallocated sector(s)'
+ if smart_status == 'FAILED':
+ health_reason = 'SMART test FAILED' + (f' ({health_reason})' if health_reason else '')
+
+ # Get persistent worst_health from database
+ worst_info = health_persistence.get_disk_worst_health(name, serial)
+ worst_health = worst_info.get('worst_health', 'healthy') if worst_info else 'healthy'
+ worst_health_date = worst_info.get('worst_health_date') if worst_info else None
+ worst_health_reason = worst_info.get('worst_health_reason', '') if worst_info else ''
+ admin_cleared = worst_info.get('admin_cleared', False) if worst_info else False
+
+ # Update worst_health if current is worse
+ if current_health != 'healthy':
+ updated = health_persistence.update_disk_worst_health(
+ name, serial, current_health, health_reason
+ )
+ if updated:
+ worst_health = current_health
+ worst_health_reason = health_reason
+
+ # Record as disk observation (for both internal and USB disks)
+ # This ensures SMART issues are tracked in observations
+ try:
+ obs_type = 'smart_error'
+ if pending_sectors and pending_sectors > 0:
+ obs_type = 'pending_sectors'
+ elif reallocated_sectors and reallocated_sectors > 0:
+ obs_type = 'reallocated_sectors'
+ elif smart_status == 'FAILED':
+ obs_type = 'smart_failed'
+
+ obs_sig = f'smart_{name}_{obs_type}_{pending_sectors}_{reallocated_sectors}'
+ health_persistence.record_disk_observation(
+ device_name=name,
+ serial=serial,
+ error_type=obs_type,
+ error_signature=obs_sig,
+ raw_message=f'/dev/{name}: {health_reason}',
+ severity=current_health,
+ )
+
+                        # Send smart_warning notification only when worst_health was just
+                        # escalated (updated=True), not on every pass while degraded
+ if updated:
+ try:
+ from notification_manager import notification_manager
+ notification_manager.send_notification(
+ event_type='smart_warning',
+ data={
+ 'device': f'/dev/{name}',
+ 'reason': health_reason,
+ 'serial': serial or 'Unknown',
+ 'model': model or 'Unknown',
+ 'pending_sectors': pending_sectors,
+ 'reallocated_sectors': reallocated_sectors,
+ 'smart_status': smart_status,
+ 'hostname': self._hostname,
+ }
+ )
+ except Exception:
+ pass
+ except Exception:
+ pass
+
+ # Final health is the worse of current and persistent
+ severity_order = {'healthy': 0, 'warning': 1, 'critical': 2}
+ if severity_order.get(worst_health, 0) > severity_order.get(current_health, 0):
+ final_health = worst_health
+ final_reason = worst_health_reason
+ else:
+ final_health = current_health
+ final_reason = health_reason
+
+ # Get active observations count
+ obs = health_persistence.get_disk_observations(device_name=name, serial=serial)
+ active_observations = len(obs) if obs else 0
+
+ # Register disk in persistence (for tracking)
+ try:
+ health_persistence.register_disk(name, serial, model)
+ except Exception:
+ pass
+
+ disks.append({
+ 'device': f'/dev/{name}',
+ 'name': name,
+ 'serial': serial or '',
+ 'model': model or 'Unknown',
+ 'size': size,
+ 'transport': transport,
+ 'is_usb': is_usb,
+ 'is_nvme': is_nvme,
+ 'smart_status': smart_status,
+ 'current_health': current_health,
+ 'current_health_reason': health_reason,
+ 'worst_health': worst_health,
+ 'worst_health_date': worst_health_date,
+ 'worst_health_reason': worst_health_reason,
+ 'final_health': final_health,
+ 'final_reason': final_reason,
+ 'pending_sectors': pending_sectors,
+ 'reallocated_sectors': reallocated_sectors,
+ 'active_observations': active_observations,
+ 'admin_cleared': admin_cleared,
+ })
+
+ except Exception as e:
+ print(f"[HealthMonitor] Error getting physical disks list: {e}")
+
+ return disks
# This function is no longer used in get_detailed_status, but kept for reference if needed.
# The new _check_proxmox_storage function handles this logic better.
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 423d8316..40d45dcf 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -162,6 +162,12 @@ class HealthPersistence:
first_seen TEXT NOT NULL,
last_seen TEXT NOT NULL,
removed INTEGER DEFAULT 0,
+ worst_health TEXT DEFAULT 'healthy',
+ worst_health_date TEXT,
+ worst_health_reason TEXT,
+ admin_cleared INTEGER DEFAULT 0,
+ admin_cleared_date TEXT,
+ admin_cleared_note TEXT,
UNIQUE(device_name, serial)
)
''')
@@ -189,6 +195,17 @@ class HealthPersistence:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
+ # Migration: add worst_health columns to disk_registry if not present
+ cursor.execute("PRAGMA table_info(disk_registry)")
+ disk_columns = [col[1] for col in cursor.fetchall()]
+ if 'worst_health' not in disk_columns:
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health TEXT DEFAULT 'healthy'")
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_date TEXT")
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN worst_health_reason TEXT")
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared INTEGER DEFAULT 0")
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_date TEXT")
+ cursor.execute("ALTER TABLE disk_registry ADD COLUMN admin_cleared_note TEXT")
+
conn.commit()
conn.close()
@@ -1476,6 +1493,186 @@ class HealthPersistence:
except Exception as e:
print(f"[HealthPersistence] Error marking removed disks: {e}")
+ # ────────────────────────────────────────────────────────────────
+ # Disk Worst Health State Tracking
+ # ────────────────────────────────────────────────────────────────
+
+ HEALTH_SEVERITY_ORDER = {'healthy': 0, 'warning': 1, 'critical': 2}
+
+ def update_disk_worst_health(self, device_name: str, serial: Optional[str],
+ health: str, reason: str = '') -> bool:
+ """Update worst_health if the new health is worse than current.
+
+ Health progression is one-way: healthy -> warning -> critical
+ Only admin_clear_disk_health() can reset to healthy.
+
+ Returns True if worst_health was updated.
+ """
+ health_lower = health.lower()
+ if health_lower not in self.HEALTH_SEVERITY_ORDER:
+ return False
+
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+ if not disk_id:
+ # Auto-register disk if not present
+ self.register_disk(device_name.replace('/dev/', ''), serial)
+ disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+
+ if not disk_id:
+ conn.close()
+ return False
+
+ # Get current worst_health
+ cursor.execute('SELECT worst_health, admin_cleared FROM disk_registry WHERE id = ?', (disk_id,))
+ row = cursor.fetchone()
+ if not row:
+ conn.close()
+ return False
+
+ current_worst = row[0] or 'healthy'
+ admin_cleared = row[1] or 0
+
+ # If admin cleared and new issue is the same or less severe, don't update
+ # But if admin cleared and issue escalates, update anyway
+ current_severity = self.HEALTH_SEVERITY_ORDER.get(current_worst, 0)
+ new_severity = self.HEALTH_SEVERITY_ORDER.get(health_lower, 0)
+
+ # Only update if new health is worse
+ if new_severity > current_severity:
+ now = datetime.now().isoformat()
+ cursor.execute('''
+ UPDATE disk_registry
+ SET worst_health = ?, worst_health_date = ?, worst_health_reason = ?,
+ admin_cleared = 0
+ WHERE id = ?
+ ''', (health_lower, now, reason, disk_id))
+ conn.commit()
+ conn.close()
+ return True
+
+ conn.close()
+ return False
+ except Exception as e:
+ print(f"[HealthPersistence] Error updating disk worst_health: {e}")
+ return False
+
+ def get_disk_worst_health(self, device_name: str, serial: Optional[str] = None) -> Optional[Dict[str, Any]]:
+ """Get the worst health state for a specific disk."""
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+ if not disk_id:
+ conn.close()
+ return None
+
+ cursor.execute('''
+ SELECT worst_health, worst_health_date, worst_health_reason,
+ admin_cleared, admin_cleared_date, admin_cleared_note
+ FROM disk_registry WHERE id = ?
+ ''', (disk_id,))
+ row = cursor.fetchone()
+ conn.close()
+
+ if row:
+ return {
+ 'worst_health': row[0] or 'healthy',
+ 'worst_health_date': row[1],
+ 'worst_health_reason': row[2],
+ 'admin_cleared': bool(row[3]),
+ 'admin_cleared_date': row[4],
+ 'admin_cleared_note': row[5],
+ }
+ return None
+ except Exception as e:
+ print(f"[HealthPersistence] Error getting disk worst_health: {e}")
+ return None
+
+ def admin_clear_disk_health(self, device_name: str, serial: Optional[str], note: str) -> bool:
+ """Admin manually clears disk health history (e.g., after disk replacement).
+
+ Requires a note explaining why (for audit trail).
+ """
+ if not note or len(note.strip()) < 5:
+ return False # Require meaningful note
+
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ disk_id = self._get_disk_registry_id(cursor, device_name.replace('/dev/', ''), serial)
+ if not disk_id:
+ conn.close()
+ return False
+
+ now = datetime.now().isoformat()
+ cursor.execute('''
+ UPDATE disk_registry
+ SET worst_health = 'healthy', admin_cleared = 1,
+ admin_cleared_date = ?, admin_cleared_note = ?
+ WHERE id = ?
+ ''', (now, note.strip(), disk_id))
+
+ # Also dismiss all active observations for this disk
+ cursor.execute('''
+ UPDATE disk_observations SET dismissed = 1 WHERE disk_registry_id = ?
+ ''', (disk_id,))
+
+ conn.commit()
+ conn.close()
+ return True
+ except Exception as e:
+ print(f"[HealthPersistence] Error clearing disk health: {e}")
+ return False
+
+ def get_all_disks_health_summary(self) -> List[Dict[str, Any]]:
+ """Get health summary for all registered disks (for Health Monitor listing).
+
+ Returns list of disks with their current and worst health states.
+ """
+ try:
+ conn = self._get_conn()
+ cursor = conn.cursor()
+
+ cursor.execute('''
+ SELECT d.id, d.device_name, d.serial, d.model, d.size_bytes,
+ d.first_seen, d.last_seen, d.removed,
+ d.worst_health, d.worst_health_date, d.worst_health_reason,
+ d.admin_cleared, d.admin_cleared_date,
+ (SELECT COUNT(*) FROM disk_observations o
+ WHERE o.disk_registry_id = d.id AND o.dismissed = 0) as active_observations
+ FROM disk_registry d
+ WHERE d.removed = 0
+ ORDER BY d.device_name
+ ''')
+ rows = cursor.fetchall()
+ conn.close()
+
+ return [{
+ 'id': r[0],
+ 'device_name': r[1],
+ 'serial': r[2] or '',
+ 'model': r[3] or 'Unknown',
+ 'size_bytes': r[4],
+ 'first_seen': r[5],
+ 'last_seen': r[6],
+ 'removed': bool(r[7]),
+ 'worst_health': r[8] or 'healthy',
+ 'worst_health_date': r[9],
+ 'worst_health_reason': r[10] or '',
+ 'admin_cleared': bool(r[11]),
+ 'admin_cleared_date': r[12],
+ 'active_observations': r[13],
+ } for r in rows]
+ except Exception as e:
+ print(f"[HealthPersistence] Error getting disks health summary: {e}")
+ return []
+
# Global instance
health_persistence = HealthPersistence()
diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py
index a67de4da..ef8306b3 100644
--- a/AppImage/scripts/notification_templates.py
+++ b/AppImage/scripts/notification_templates.py
@@ -559,6 +559,13 @@ TEMPLATES = {
'group': 'storage',
'default_enabled': True,
},
+ 'smart_warning': {
+ 'title': '{hostname}: SMART warning on {device}',
+ 'body': '{device}: {reason}',
+ 'label': 'SMART warning (sectors)',
+ 'group': 'storage',
+ 'default_enabled': True,
+ },
'storage_unavailable': {
'title': '{hostname}: Storage unavailable - {storage_name}',
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',