mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-06-03 13:54:41 +00:00
Health Monitor: reconcile stale disk warnings across reboots
When a host gets transient I/O events on a disk while smartctl is
momentarily unavailable (the canonical case: late in a noisy
shutdown), the disk-scan code records a `disk_<name>` WARNING tagged
"SMART: unavailable" exactly once and trusts the next scan to clear
it. That trust is misplaced: the clear path only fires when the
device shows up in the current dmesg window with zero events. After
a reboot, dmesg is empty for that device — so the device never gets
iterated, resolve_error is never called, and the dashboard stays
orange for a disk whose SMART now reports PASSED.
Caught on a lab host where `disk_nvme2n1` had been stuck as WARNING
for hours after a reboot. SMART was 100% healthy at the moment of
inspection (Critical Warning 0x00, 0 media errors, 100% spare). The
error's first_seen and last_seen were identical and pre-dated the
current boot, confirming a one-shot record that nothing had cleared.
Fix: add a `_reconcile_stale_disk_warnings()` pass at the top of
`_check_disks_optimized()`. For every active `disk_*` error
(skipping `disk_fs_*`, which is already reconciled separately):
- device gone from /dev/ → resolve "Device no longer present"
- device present + SMART PASSED → resolve "Transient I/O cleared,
SMART now reports healthy"
- device present + SMART UNKNOWN/FAILED → leave active so the
main loop can re-classify on the next dmesg window
Acknowledged errors are left alone so the user's explicit dismiss
intent isn't overridden.
Verified end-to-end: re-injected the original `disk_nvme2n1`
warning into the persistence DB on the lab host, waited one scan
cycle, error was resolved automatically with `resolved_at` set and
`resolution_reason = 'Transient I/O cleared, SMART now reports
healthy'`.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -2361,18 +2361,102 @@ class HealthMonitor:
|
||||
except Exception:
|
||||
return fallback
|
||||
|
||||
def _reconcile_stale_disk_warnings(self) -> None:
    """
    Reconcile persisted disk_<name> warnings against the current host
    state before each disk scan.

    The disk-scan loop only resolves an error when the device appears
    in the current dmesg window with zero events. After a reboot,
    dmesg is empty for that device, so the loop never iterates it,
    and a `disk_<name>` WARNING recorded as "SMART: unavailable"
    during a noisy shutdown can stay active forever -- the dashboard
    keeps showing an orange "Warning" badge for a disk whose SMART
    is in fact PASSED.

    This pass walks the active errors of category 'disks' whose key
    starts with `disk_` (skipping `disk_fs_*`, which per the in-code
    note is reconciled by a dedicated block elsewhere) and:

      - device gone from /dev/            -> resolve as
        "Device no longer present in system"
      - device present + SMART PASSED     -> resolve as
        "Transient I/O cleared, SMART now reports healthy"
      - device present + SMART UNKNOWN    -> leave the warning active
        (the original condition is still ambiguous)
      - device present + SMART FAILED     -> leave the warning active
        (the main loop is expected to re-classify it)

    Errors whose `acknowledged` field equals 1 are never touched, so
    an explicit user dismissal is not overridden.

    The pass is best-effort: failures of the persistence layer or of
    the SMART query are swallowed so that they cannot abort the disk
    scan that runs right after it. Returns None.
    """
    try:
        active = health_persistence.get_active_errors(category='disks')
    except Exception:
        # Persistence layer unavailable: nothing to reconcile this cycle.
        return

    # `or ()` guards against a None result so the loop cannot raise.
    for err in active or ():
        err_key = err.get('error_key', '') or ''

        # Skip the filesystem-mount errors -- a dedicated block handles
        # them with its own reconciliation rules.
        if not err_key.startswith('disk_') or err_key.startswith('disk_fs_'):
            continue

        # Don't disturb errors the user explicitly acknowledged.
        if err.get('acknowledged') == 1:
            continue

        details = err.get('details')
        if isinstance(details, str):
            try:
                details = json.loads(details)
            except Exception:
                details = {}
        # `details` may be None (key present but null) or a JSON value
        # that is not an object (list, string, number, null). Normalise
        # to a dict so the lookups below cannot raise AttributeError and
        # take the whole disk scan down with them.
        if not isinstance(details, dict):
            details = {}

        # Recover the block device name. Prefer the structured
        # `block_device` field; fall back to `disk` or derive it from
        # the error_key (`disk_nvme2n1` -> `nvme2n1`).
        base_disk = (
            details.get('block_device')
            or details.get('disk')
            or err_key[len('disk_'):]
        )
        if not base_disk:
            continue

        dev_path = f'/dev/{base_disk}'
        if not os.path.exists(dev_path):
            try:
                health_persistence.resolve_error(
                    err_key, 'Device no longer present in system')
            except Exception:
                # Best-effort: the error stays active and is retried on
                # the next scan cycle.
                pass
            continue

        # Device exists -- query SMART. _quick_smart_health returns
        # 'PASSED' / 'FAILED' / 'UNKNOWN'.
        try:
            smart_health = self._quick_smart_health(base_disk)
        except Exception:
            smart_health = 'UNKNOWN'

        if smart_health == 'PASSED':
            try:
                health_persistence.resolve_error(
                    err_key,
                    'Transient I/O cleared, SMART now reports healthy')
            except Exception:
                # Best-effort: retried on the next scan cycle.
                pass
        # else: SMART UNKNOWN or FAILED -- leave active and let the
        # main loop classify it on the next dmesg window.
|
||||
|
||||
def _check_disks_optimized(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Disk I/O error check -- the SINGLE source of truth for disk errors.
|
||||
|
||||
|
||||
Reads dmesg for I/O/ATA/SCSI errors, counts per device, records in
|
||||
health_persistence, and returns status for the health dashboard.
|
||||
Resolves ATA controller names (ata8) to physical disks (sda).
|
||||
|
||||
|
||||
Cross-references SMART health to avoid false positives from transient
|
||||
ATA controller errors. If SMART reports PASSED, dmesg errors are
|
||||
downgraded to INFO (transient).
|
||||
"""
|
||||
# Reconcile any disk_<name> warnings persisted across a noisy
|
||||
# shutdown / reboot before the main scan starts. Without this
|
||||
# pass the main loop only resolves errors for devices that show
|
||||
# fresh events in the current dmesg window — devices that simply
|
||||
# disappeared from dmesg stay flagged indefinitely.
|
||||
self._reconcile_stale_disk_warnings()
|
||||
|
||||
current_time = time.time()
|
||||
disk_results = {} # Single dict for both WARNING and CRITICAL
|
||||
|
||||
|
||||
Reference in New Issue
Block a user