mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 01:52:20 +00:00
Update notification service
This commit is contained in:
@@ -1418,6 +1418,34 @@ def get_storage_info():
|
||||
# print(f"Error getting partition info: {e}")
|
||||
pass
|
||||
|
||||
# ── Register disks in observation system + enrich with observation counts ──
|
||||
try:
|
||||
active_dev_names = list(physical_disks.keys())
|
||||
obs_counts = health_persistence.get_disks_observation_counts()
|
||||
|
||||
for disk_name, disk_info in physical_disks.items():
|
||||
# Register each disk we see
|
||||
health_persistence.register_disk(
|
||||
device_name=disk_name,
|
||||
serial=disk_info.get('serial', ''),
|
||||
model=disk_info.get('model', ''),
|
||||
size_bytes=disk_info.get('size_bytes'),
|
||||
)
|
||||
|
||||
# Attach observation count: try serial match first, then device name
|
||||
serial = disk_info.get('serial', '')
|
||||
count = obs_counts.get(f'serial:{serial}', 0) if serial else 0
|
||||
if count == 0:
|
||||
count = obs_counts.get(disk_name, 0)
|
||||
disk_info['observations_count'] = count
|
||||
|
||||
# Mark disks no longer present as removed
|
||||
health_persistence.mark_removed_disks(active_dev_names)
|
||||
# Auto-dismiss stale observations (> 30 days old)
|
||||
health_persistence.cleanup_stale_observations()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
storage_data['disks'] = list(physical_disks.values())
|
||||
|
||||
return storage_data
|
||||
|
||||
@@ -135,19 +135,22 @@ class HealthMonitor:
|
||||
# These are logged at ERR level but are common on SATA controllers
|
||||
# during hot-plug, link renegotiation, or cable noise. They are NOT
|
||||
# indicative of disk failure unless SMART also reports problems.
|
||||
r'ata\d+.*SError.*BadCRC',
|
||||
r'ata\d+.*Emask 0x10.*ATA bus error',
|
||||
r'failed command: (READ|WRITE) FPDMA QUEUED',
|
||||
# NOTE: patterns are matched against line.lower(), so use lowercase.
|
||||
r'ata\d+.*serror.*badcrc',
|
||||
r'ata\d+.*emask 0x10.*ata bus error',
|
||||
r'failed command: (read|write) fpdma queued',
|
||||
r'ata\d+.*hard resetting link',
|
||||
r'ata\d+.*link is slow',
|
||||
r'ata\d+.*COMRESET',
|
||||
r'ata\d+.*comreset',
|
||||
|
||||
# ── ProxMenux self-referential noise ──
|
||||
# The monitor reporting its OWN service failures is circular --
|
||||
# it cannot meaningfully alert about itself.
|
||||
r'proxmenux-monitor\.service.*Failed',
|
||||
# NOTE: patterns are matched against line.lower(), so use lowercase.
|
||||
r'proxmenux-monitor\.service.*failed',
|
||||
r'proxmenux-monitor\.service.*exit-code',
|
||||
r'ProxMenux-Monitor.*Failed at step EXEC',
|
||||
r'proxmenux-monitor.*failed at step exec',
|
||||
r'proxmenux-monitor\.appimage',
|
||||
|
||||
# ── PVE scheduler operational noise ──
|
||||
# pvescheduler emits "could not update job state" every minute
|
||||
@@ -1147,6 +1150,42 @@ class HealthMonitor:
|
||||
|
||||
return storages
|
||||
|
||||
@staticmethod
|
||||
def _make_io_obs_signature(disk: str, sample: str) -> str:
|
||||
"""Create a stable observation signature for I/O errors on a disk.
|
||||
|
||||
All ATA errors on the same disk (exception Emask, revalidation failed,
|
||||
hard resetting link, SError, etc.) map to ONE signature per error family.
|
||||
This ensures that "Emask 0x1 SAct 0xc1000000" and "Emask 0x1 SAct 0x804000"
|
||||
and "revalidation failed" all dedup into the same observation.
|
||||
"""
|
||||
if not sample:
|
||||
return f'io_{disk}_generic'
|
||||
|
||||
s = sample.lower()
|
||||
|
||||
# Classify into error families (order matters: first match wins)
|
||||
families = [
|
||||
# ATA controller errors: exception, emask, revalidation, reset
|
||||
# All these are symptoms of the same underlying connection issue
|
||||
(r'exception\s+emask|emask\s+0x|revalidation failed|hard resetting link|'
|
||||
r'serror.*badcrc|comreset|link is slow|status.*drdy',
|
||||
'ata_connection_error'),
|
||||
# SCSI / block-layer errors
|
||||
(r'i/o error|blk_update_request|medium error|sense key',
|
||||
'block_io_error'),
|
||||
# Failed commands (READ/WRITE FPDMA QUEUED)
|
||||
(r'failed command|fpdma queued',
|
||||
'ata_failed_command'),
|
||||
]
|
||||
|
||||
for pattern, family in families:
|
||||
if re.search(pattern, s):
|
||||
return f'io_{disk}_{family}'
|
||||
|
||||
# Fallback: generic per-disk
|
||||
return f'io_{disk}_generic'
|
||||
|
||||
def _resolve_ata_to_disk(self, ata_port: str) -> str:
|
||||
"""Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda').
|
||||
|
||||
@@ -1444,6 +1483,26 @@ class HealthMonitor:
|
||||
|
||||
smart_ok = smart_health == 'PASSED'
|
||||
|
||||
# ── Record disk observation (always, even if transient) ──
|
||||
# Signature must be stable across cycles: strip volatile
|
||||
# data (hex values, counts, timestamps) to dedup properly.
|
||||
# e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
|
||||
# and "ata8.00: revalidation failed (errno=-2)"
|
||||
# both map to the same per-device I/O observation.
|
||||
try:
|
||||
obs_sig = self._make_io_obs_signature(disk, sample)
|
||||
obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
|
||||
health_persistence.record_disk_observation(
|
||||
device_name=disk,
|
||||
serial=None,
|
||||
error_type='io_error',
|
||||
error_signature=obs_sig,
|
||||
raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
|
||||
severity=obs_severity,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Transient-only errors (e.g. SError with auto-recovery)
|
||||
# are always INFO regardless of SMART
|
||||
if all_transient:
|
||||
|
||||
@@ -150,6 +150,45 @@ class HealthPersistence:
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)')
|
||||
|
||||
# ── Disk Observations System ──
|
||||
# Registry of all physical disks seen by the system
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS disk_registry (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
device_name TEXT NOT NULL,
|
||||
serial TEXT,
|
||||
model TEXT,
|
||||
size_bytes INTEGER,
|
||||
first_seen TEXT NOT NULL,
|
||||
last_seen TEXT NOT NULL,
|
||||
removed INTEGER DEFAULT 0,
|
||||
UNIQUE(device_name, serial)
|
||||
)
|
||||
''')
|
||||
|
||||
# Observation log: deduplicated error events per disk
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS disk_observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
disk_registry_id INTEGER NOT NULL,
|
||||
error_type TEXT NOT NULL,
|
||||
error_signature TEXT NOT NULL,
|
||||
first_occurrence TEXT NOT NULL,
|
||||
last_occurrence TEXT NOT NULL,
|
||||
occurrence_count INTEGER DEFAULT 1,
|
||||
raw_message TEXT,
|
||||
severity TEXT DEFAULT 'warning',
|
||||
dismissed INTEGER DEFAULT 0,
|
||||
FOREIGN KEY(disk_registry_id) REFERENCES disk_registry(id),
|
||||
UNIQUE(disk_registry_id, error_type, error_signature)
|
||||
)
|
||||
''')
|
||||
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_serial ON disk_registry(serial)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_device ON disk_registry(device_name)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
@@ -519,10 +558,12 @@ class HealthPersistence:
|
||||
}
|
||||
child_prefix = CASCADE_PREFIXES.get(error_key)
|
||||
if child_prefix:
|
||||
# Only cascade to active (unresolved) child errors.
|
||||
# Already-resolved/expired entries must NOT be re-surfaced.
|
||||
cursor.execute('''
|
||||
UPDATE errors
|
||||
SET acknowledged = 1, resolved_at = ?, suppression_hours = ?
|
||||
WHERE error_key LIKE ? AND acknowledged = 0
|
||||
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
|
||||
''', (now, sup_hours, child_prefix + '%'))
|
||||
|
||||
result = {
|
||||
@@ -1119,5 +1160,225 @@ class HealthPersistence:
|
||||
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
# Disk Observations API
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
|
||||
def register_disk(self, device_name: str, serial: Optional[str] = None,
                  model: Optional[str] = None, size_bytes: Optional[int] = None):
    """Register or update a physical disk in the registry.

    Uses (device_name, serial) as the unique key; serial is normalized to
    '' so the UNIQUE constraint applies (SQLite treats NULLs as distinct).
    If the disk was previously marked removed, it is re-activated.

    Best-effort: any DB error is logged and swallowed.
    """
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # Upsert: keep existing model/size when the caller passes None.
        cursor.execute('''
            INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
            VALUES (?, ?, ?, ?, ?, ?, 0)
            ON CONFLICT(device_name, serial) DO UPDATE SET
                model = COALESCE(excluded.model, model),
                size_bytes = COALESCE(excluded.size_bytes, size_bytes),
                last_seen = excluded.last_seen,
                removed = 0
        ''', (device_name, serial or '', model, size_bytes, now, now))

        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def _get_disk_registry_id(self, cursor, device_name: str,
|
||||
serial: Optional[str] = None) -> Optional[int]:
|
||||
"""Find disk_registry.id, matching by serial first, then device_name."""
|
||||
if serial:
|
||||
cursor.execute(
|
||||
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
|
||||
(serial,))
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return row[0]
|
||||
# Fallback: match by device_name (strip /dev/ prefix)
|
||||
clean_dev = device_name.replace('/dev/', '')
|
||||
cursor.execute(
|
||||
'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
|
||||
(clean_dev,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
def record_disk_observation(self, device_name: str, serial: Optional[str],
                            error_type: str, error_signature: str,
                            raw_message: str = '',
                            severity: str = 'warning'):
    """Record or deduplicate a disk error observation.

    Args:
        device_name: Block device, with or without '/dev/' prefix.
        serial: Disk serial number (may be None/empty).
        error_type: 'smart_error', 'io_error', 'connection_error'.
        error_signature: Normalized unique string for dedup
            (e.g. 'FailedReadSmartSelfTestLog').
        raw_message: Human-readable detail kept for display.
        severity: 'warning' or 'critical'. An existing observation is only
            ever escalated to critical, never downgraded.

    Best-effort: any DB error is logged and swallowed.
    """
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # Auto-register the disk if not present
        clean_dev = device_name.replace('/dev/', '')
        self.register_disk(clean_dev, serial)

        disk_id = self._get_disk_registry_id(cursor, clean_dev, serial)
        if not disk_id:
            return  # unknown disk; connection closed in finally

        # Upsert observation: same (disk, type, signature) bumps the
        # counter, refreshes last_occurrence, and un-dismisses the entry.
        cursor.execute('''
            INSERT INTO disk_observations
                (disk_registry_id, error_type, error_signature, first_occurrence,
                 last_occurrence, occurrence_count, raw_message, severity, dismissed)
            VALUES (?, ?, ?, ?, ?, 1, ?, ?, 0)
            ON CONFLICT(disk_registry_id, error_type, error_signature) DO UPDATE SET
                last_occurrence = excluded.last_occurrence,
                occurrence_count = occurrence_count + 1,
                severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
                dismissed = 0
        ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))

        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error recording disk observation: {e}")
    finally:
        # Close even when execute/commit raised (original leaked the
        # connection on the error path).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_disk_observations(self, device_name: Optional[str] = None,
                          serial: Optional[str] = None) -> List[Dict[str, Any]]:
    """Get active (non-dismissed) observations for one disk or all disks.

    Args:
        device_name: Optional block device name (with or without '/dev/').
        serial: Optional disk serial; preferred for disk lookup.

    Returns:
        List of observation dicts, newest (by last_occurrence) first.
        Empty list when the requested disk is unknown or on any DB error.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # One query template for both the per-disk and all-disks cases;
        # the original duplicated the 12-column SELECT, which invites
        # drift between the two copies.
        disk_filter = ''
        params: tuple = ()
        if device_name or serial:
            disk_id = self._get_disk_registry_id(cursor,
                                                 device_name or '', serial)
            if not disk_id:
                return []  # unknown disk; connection closed in finally
            disk_filter = ' AND o.disk_registry_id = ?'
            params = (disk_id,)

        cursor.execute(f'''
            SELECT o.id, o.error_type, o.error_signature,
                   o.first_occurrence, o.last_occurrence,
                   o.occurrence_count, o.raw_message, o.severity, o.dismissed,
                   d.device_name, d.serial, d.model
            FROM disk_observations o
            JOIN disk_registry d ON o.disk_registry_id = d.id
            WHERE o.dismissed = 0{disk_filter}
            ORDER BY o.last_occurrence DESC
        ''', params)

        rows = cursor.fetchall()

        return [{
            'id': r[0],
            'error_type': r[1],
            'error_signature': r[2],
            'first_occurrence': r[3],
            'last_occurrence': r[4],
            'occurrence_count': r[5],
            'raw_message': r[6] or '',
            'severity': r[7],
            'dismissed': bool(r[8]),
            'device_name': r[9],
            'serial': r[10],
            'model': r[11],
        } for r in rows]
    except Exception as e:
        print(f"[HealthPersistence] Error getting observations: {e}")
        return []
    finally:
        # Close even on the error path (original leaked the connection
        # when a query raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_disks_observation_counts(self) -> Dict[str, int]:
    """Return {device_name: count} of active observations per disk.

    Also includes serial-keyed entries ('serial:<sn>') for cross-device
    matching, so callers can match a disk whose /dev name changed.

    Returns:
        Mapping of device name (and serial key, when the serial is
        non-empty) to active-observation count; {} on any DB error.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute('''
            SELECT d.device_name, d.serial, COUNT(o.id) as cnt
            FROM disk_observations o
            JOIN disk_registry d ON o.disk_registry_id = d.id
            WHERE o.dismissed = 0
            GROUP BY d.id
        ''')
        result = {}
        for device_name, serial, cnt in cursor.fetchall():
            result[device_name] = cnt
            if serial:
                # Secondary key for serial-based lookup by callers.
                result[f'serial:{serial}'] = cnt
        return result
    except Exception as e:
        print(f"[HealthPersistence] Error getting observation counts: {e}")
        return {}
    finally:
        # Close even on the error path (original leaked the connection
        # when the query raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def dismiss_disk_observation(self, observation_id: int):
    """Mark a single observation as dismissed.

    Args:
        observation_id: disk_observations.id of the row to dismiss.

    Best-effort: a DB error is logged and swallowed; dismissing an
    unknown id is a silent no-op (UPDATE matches zero rows).
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute(
            'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
            (observation_id,))
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error dismissing observation: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def cleanup_stale_observations(self, max_age_days: int = 30):
    """Auto-dismiss observations not seen in max_age_days.

    Args:
        max_age_days: Age threshold in days; active observations whose
            last_occurrence is older than this are dismissed.

    Best-effort: any DB error is logged and swallowed.
    """
    conn = None
    try:
        from datetime import timedelta
        # ISO-8601 strings compare lexicographically in timestamp order,
        # so a plain string comparison against the cutoff is correct.
        cutoff = (datetime.now() - timedelta(days=max_age_days)).isoformat()
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute('''
            UPDATE disk_observations
            SET dismissed = 1
            WHERE dismissed = 0 AND last_occurrence < ?
        ''', (cutoff,))
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error cleaning stale observations: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def mark_removed_disks(self, active_device_names: List[str]):
    """Mark registry disks not in active_device_names as removed.

    Args:
        active_device_names: Device names currently present on the host.
            When empty, nothing is marked (defensive: an empty scan must
            not wipe the whole registry).

    Best-effort: any DB error is logged and swallowed.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        if active_device_names:
            # NOT IN (?,?,...) with one placeholder per active device.
            placeholders = ','.join('?' for _ in active_device_names)
            cursor.execute(f'''
                UPDATE disk_registry SET removed = 1
                WHERE device_name NOT IN ({placeholders}) AND removed = 0
            ''', active_device_names)
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error marking removed disks: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised). Also dropped the original's
        # unused `now` timestamp variable.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
# Global instance
|
||||
health_persistence = HealthPersistence()
|
||||
|
||||
@@ -690,6 +690,68 @@ class JournalWatcher:
|
||||
except Exception:
|
||||
return 'UNKNOWN'
|
||||
|
||||
def _record_smartd_observation(self, title: str, message: str):
    """Extract device info from a smartd system-mail and record as disk observation.

    Parses the device path, serial, model and error signature out of the
    smartd mail body/title, then forwards them to
    health_persistence.record_disk_observation(). Best-effort: any parse
    or persistence error is logged and swallowed.
    """
    try:
        import re as _re
        from health_persistence import health_persistence

        # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
        dev_match = _re.search(r'Device:\s*/dev/(\S+?)[\s\[\],]', message)
        device = dev_match.group(1) if dev_match else ''
        if not device:
            return
        # Strip the partition suffix to get the base block device.
        # NVMe / mmcblk devices use a 'p<N>' partition suffix (nvme0n1p2);
        # stripping bare trailing digits there would mangle the namespace
        # part (nvme0n1 -> nvme0n). SATA/SCSI names use plain digits (sda1).
        if _re.match(r'(nvme|mmcblk)', device):
            base_dev = _re.sub(r'p\d+$', '', device)
        else:
            base_dev = _re.sub(r'\d+$', '', device)

        # Extract serial: "S/N:WD-WX72A30AA72R"
        sn_match = _re.search(r'S/N:\s*(\S+)', message)
        serial = sn_match.group(1) if sn_match else ''

        # Extract model: appears before S/N on the "Device info:" line
        model = ''
        model_match = _re.search(r'Device info:\s*\n?\s*(.+?)(?:,\s*S/N:)', message)
        if model_match:
            model = model_match.group(1).strip()

        # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
        sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
        if sig_match:
            error_signature = sig_match.group(1)
            error_type = 'smart_error'
        else:
            # Fallback: extract the "warning/error logged" line; sanitize
            # to a stable alphanumeric signature capped at 80 chars.
            warn_match = _re.search(
                r'warning/error was logged.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
            if warn_match:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_',
                                          warn_match.group(1).strip())[:80]
            else:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_', title)[:80]
            error_type = 'smart_error'

        # Build a clean raw_message for display
        raw_msg = f"Device: /dev/{base_dev}"
        if model:
            raw_msg += f" ({model})"
        if serial:
            raw_msg += f" S/N:{serial}"
        warn_line_m = _re.search(
            r'The following warning/error.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
        if warn_line_m:
            raw_msg += f"\n{warn_line_m.group(1).strip()}"

        health_persistence.record_disk_observation(
            device_name=base_dev,
            serial=serial,
            error_type=error_type,
            error_signature=error_signature,
            raw_message=raw_msg,
            severity='warning',
        )
    except Exception as e:
        print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
||||
|
||||
@staticmethod
|
||||
def _translate_ata_error(msg: str) -> str:
|
||||
"""Translate common ATA/SCSI error codes to human-readable descriptions."""
|
||||
@@ -1393,15 +1455,42 @@ class PollingCollector:
|
||||
Tracking is stored in ``notification_last_sent`` (same DB).
|
||||
"""
|
||||
|
||||
DIGEST_INTERVAL = 86400 # 24 h between re-notifications
|
||||
DIGEST_INTERVAL = 86400 # 24 h default between re-notifications
|
||||
UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans
|
||||
NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new"
|
||||
|
||||
# Per-category anti-oscillation cooldowns (seconds).
|
||||
# When an error resolves briefly and reappears, we still respect this
|
||||
# interval before notifying again. This prevents "semi-cascades" where
|
||||
# the same root cause generates many slightly different notifications.
|
||||
#
|
||||
# Key = health_persistence category name
|
||||
# Value = minimum seconds between notifications for the same error_key
|
||||
_CATEGORY_COOLDOWNS = {
|
||||
'disks': 86400, # 24h - I/O errors are persistent hardware issues
|
||||
'smart': 86400, # 24h - SMART errors same as I/O
|
||||
'zfs': 86400, # 24h - ZFS pool issues are persistent
|
||||
'storage': 3600, # 1h - storage availability can oscillate
|
||||
'network': 1800, # 30m - network can flap
|
||||
'pve_services': 1800, # 30m - services can restart/oscillate
|
||||
'temperature': 3600, # 1h - temp can fluctuate near thresholds
|
||||
'logs': 3600, # 1h - repeated log patterns
|
||||
'vms': 1800, # 30m - VM state oscillation
|
||||
'security': 3600, # 1h - auth failures tend to be bursty
|
||||
'cpu': 1800, # 30m - CPU spikes can be transient
|
||||
'memory': 1800, # 30m - memory pressure oscillation
|
||||
'disk': 3600, # 1h - disk space can fluctuate near threshold
|
||||
'updates': 86400, # 24h - update info doesn't change fast
|
||||
}
|
||||
|
||||
_ENTITY_MAP = {
|
||||
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
|
||||
'disk': ('storage', ''), 'network': ('network', ''),
|
||||
'load': ('node', ''),
|
||||
'disk': ('storage', ''), 'disks': ('storage', ''), 'smart': ('storage', ''),
|
||||
'zfs': ('storage', ''), 'storage': ('storage', ''),
|
||||
'network': ('network', ''),
|
||||
'pve_services': ('node', ''), 'security': ('user', ''),
|
||||
'updates': ('node', ''), 'storage': ('storage', ''),
|
||||
'updates': ('node', ''), 'logs': ('node', ''), 'vms': ('vm', ''),
|
||||
}
|
||||
|
||||
# Map health-persistence category names to our TEMPLATES event types.
|
||||
@@ -1412,14 +1501,14 @@ class PollingCollector:
|
||||
'load': 'load_high',
|
||||
'temperature': 'temp_high',
|
||||
'disk': 'disk_space_low',
|
||||
'disks': 'disk_io_error', # I/O errors from health monitor
|
||||
'smart': 'disk_io_error', # SMART errors from health monitor
|
||||
'zfs': 'disk_io_error', # ZFS pool/disk errors
|
||||
'storage': 'storage_unavailable',
|
||||
'network': 'network_down',
|
||||
'pve_services': 'service_fail',
|
||||
'security': 'auth_fail',
|
||||
'updates': 'update_summary',
|
||||
'zfs': 'disk_io_error',
|
||||
'smart': 'disk_io_error',
|
||||
'disks': 'disk_io_error',
|
||||
'logs': 'system_problem',
|
||||
'vms': 'system_problem',
|
||||
}
|
||||
@@ -1547,34 +1636,46 @@ class PollingCollector:
|
||||
# Determine if we should notify
|
||||
is_new = error_key not in self._known_errors
|
||||
last_sent = self._last_notified.get(error_key, 0)
|
||||
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
|
||||
cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
|
||||
is_due = (now - last_sent) >= cat_cooldown
|
||||
|
||||
# For re-notifications (not new): skip if stale OR not due
|
||||
# Anti-oscillation: even if "new" (resolved then reappeared),
|
||||
# respect the per-category cooldown interval. This prevents
|
||||
# "semi-cascades" where the same root cause generates multiple
|
||||
# slightly different notifications across health check cycles.
|
||||
# Each category has its own appropriate cooldown (30m for network,
|
||||
# 24h for disks, 1h for temperature, etc.).
|
||||
if not is_due:
|
||||
continue
|
||||
|
||||
# For re-notifications (not new): also skip if stale
|
||||
if not is_new:
|
||||
if error_is_stale or not is_due:
|
||||
if error_is_stale:
|
||||
continue
|
||||
|
||||
# Map to our event type
|
||||
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
|
||||
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
|
||||
|
||||
# ── SMART gate for disk errors ──
|
||||
# If the health monitor recorded a disk error but SMART is NOT
|
||||
# FAILED, skip the notification entirely. Disk notifications
|
||||
# should ONLY be sent when SMART confirms a real hardware failure.
|
||||
# This prevents WARNING-level disk errors (SMART: unavailable)
|
||||
# from being emitted as notifications at all.
|
||||
# ── Disk I/O notification policy ──
|
||||
# Disk I/O errors are ALWAYS notified (even when SMART says Passed)
|
||||
# because recurring I/O errors are real issues that should not be hidden.
|
||||
# The 24h cooldown is enforced per-device by NotificationManager
|
||||
# (event_type 'disk_io_error' gets 86400s cooldown).
|
||||
# For transient/INFO-level disk events (SMART OK, low error count),
|
||||
# the health monitor already resolves them, so they won't appear here.
|
||||
if category in ('disks', 'smart', 'zfs'):
|
||||
details = error.get('details', {})
|
||||
if isinstance(details, str):
|
||||
details_raw = error.get('details', {})
|
||||
if isinstance(details_raw, str):
|
||||
try:
|
||||
details = json.loads(details)
|
||||
details_raw = json.loads(details_raw)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
details = {}
|
||||
smart_status = details.get('smart_status', '') if isinstance(details, dict) else ''
|
||||
if smart_status != 'FAILED':
|
||||
# SMART is PASSED, UNKNOWN, or unavailable -- don't notify
|
||||
continue
|
||||
details_raw = {}
|
||||
if isinstance(details_raw, dict):
|
||||
# Extract device name for a stable entity_id (24h cooldown key)
|
||||
dev = details_raw.get('device', details_raw.get('disk', ''))
|
||||
if dev:
|
||||
eid = f'disk_{dev}' # Stable per-device fingerprint
|
||||
|
||||
# Updates are always informational notifications except
|
||||
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
|
||||
@@ -2020,11 +2121,12 @@ class ProxmoxHookWatcher:
|
||||
msg_lower = (message or '').lower()
|
||||
title_lower_sm = (title or '').lower()
|
||||
|
||||
# ── Filter smartd noise ──
|
||||
# FailedReadSmartErrorLog: smartd can't read the error log -- this is
|
||||
# a firmware quirk on some WD/Seagate drives, NOT a disk failure.
|
||||
# FailedReadSmartData: similar firmware issue.
|
||||
# These should NOT generate notifications.
|
||||
# ── Record disk observation regardless of noise filter ──
|
||||
# Even "noise" events are recorded as observations so the user
|
||||
# can see them in the Storage UI. We just don't send notifications.
|
||||
self._record_smartd_observation(title or '', message or '')
|
||||
|
||||
# ── Filter smartd noise (suppress notification, not observation) ──
|
||||
smartd_noise = [
|
||||
'failedreadsmarterrorlog',
|
||||
'failedreadsmartdata',
|
||||
|
||||
@@ -767,11 +767,29 @@ class NotificationManager:
|
||||
# Same as Proxmox's notification policy. The JournalWatcher already
|
||||
# gates these through SMART verification + its own 24h dedup, but
|
||||
# this acts as defense-in-depth in case a disk event arrives from
|
||||
# another source (PollingCollector, hooks, etc.).
|
||||
# another source (PollingCollector, hooks, health monitor, etc.).
|
||||
_DISK_EVENTS = {'disk_io_error', 'storage_unavailable'}
|
||||
if event.event_type in _DISK_EVENTS and cooldown_str is None:
|
||||
cooldown = 86400 # 24 hours
|
||||
|
||||
# Health monitor state_change events: per-category cooldowns.
|
||||
# Different health categories need different re-notification intervals.
|
||||
# This is the defense-in-depth layer matching HealthEventWatcher's
|
||||
# _CATEGORY_COOLDOWNS to prevent semi-cascades across all categories.
|
||||
_HEALTH_CATEGORY_COOLDOWNS = {
|
||||
'disks': 86400, 'smart': 86400, 'zfs': 86400, # 24h
|
||||
'storage': 3600, 'temperature': 3600, 'logs': 3600,
|
||||
'security': 3600, 'disk': 3600, # 1h
|
||||
'network': 1800, 'pve_services': 1800,
|
||||
'vms': 1800, 'cpu': 1800, 'memory': 1800, # 30m
|
||||
'updates': 86400, # 24h
|
||||
}
|
||||
if event.event_type == 'state_change' and event.source == 'health':
|
||||
cat = (event.data or {}).get('category', '')
|
||||
cat_cd = _HEALTH_CATEGORY_COOLDOWNS.get(cat)
|
||||
if cat_cd and cooldown_str is None:
|
||||
cooldown = max(cooldown, cat_cd)
|
||||
|
||||
# Backup/replication events: each execution is unique and should
|
||||
# always be delivered. A 10s cooldown prevents exact duplicates
|
||||
# (webhook + tasks) but allows repeated backup jobs to report.
|
||||
|
||||
Reference in New Issue
Block a user