update notification_events.py

2026-05-30 12:04:43 +00:00 · 2026-04-09 14:08:56 +02:00
parent 2b8caa924f
commit 435f346d98
2 changed files with 241 additions and 3 deletions
@@ -739,6 +739,14 @@ class HealthPersistence:
            }
            conn.commit()
            conn.close()
+            
+            # ── Clear cooldowns for newly dismissed errors too ──
+            if sup_hours != -1:
+                if category == 'disks':
+                    self._clear_disk_io_cooldown(error_key)
+                else:
+                    self._clear_notification_cooldown(error_key)
+            
            return result
        
        if row:
@@ -803,6 +811,19 @@ class HealthPersistence:
        
        conn.commit()
        conn.close()
+        
+        # ── Coordinate with notification cooldowns ──
+        # When an error is dismissed with non-permanent suppression,
+        # clear the corresponding cooldown in notification_last_sent
+        # so it can re-notify after the suppression period expires.
+        # This applies to ALL categories, not just disks.
+        if sup_hours != -1:
+            if category == 'disks':
+                self._clear_disk_io_cooldown(error_key)
+            else:
+                # For non-disk categories, clear the PollingCollector cooldown
+                self._clear_notification_cooldown(error_key)
+        
        return result
    
    def is_error_acknowledged(self, error_key: str) -> bool:
@@ -2732,5 +2753,123 @@ class HealthPersistence:
            return set()


+    def _clear_notification_cooldown(self, error_key: str):
+        """
+        Clear notification cooldowns from notification_last_sent for non-disk errors.
+
+        This coordinates with PollingCollector's 24h cooldown system.
+        When an error is dismissed, the matching cooldown rows are removed so
+        the error can be re-detected and re-notified after the suppression
+        period expires.
+
+        Every row whose fingerprint contains ``error_key`` as a *literal*
+        substring is deleted. That includes the ``health_<error_key>``
+        fingerprint used by PollingCollector, so a separate exact-match
+        DELETE is not needed.
+
+        Args:
+            error_key: Key of the dismissed error. An empty/None key is
+                ignored: it would otherwise build the pattern ``%%`` and
+                wipe the whole cooldown table.
+
+        Never raises: failures are printed and swallowed (best effort).
+        """
+        if not error_key:
+            return
+
+        # '_' and '%' are LIKE wildcards; error keys routinely contain '_',
+        # so escape them to avoid deleting unrelated fingerprints.
+        escaped_key = (
+            error_key.replace('\\', '\\\\')
+            .replace('%', '\\%')
+            .replace('_', '\\_')
+        )
+
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+            cursor.execute(
+                "DELETE FROM notification_last_sent "
+                "WHERE fingerprint LIKE ? ESCAPE '\\'",
+                (f'%{escaped_key}%',)
+            )
+            # Single statement, so rowcount is the true number of rows removed.
+            deleted_count = cursor.rowcount
+            conn.commit()
+
+            if deleted_count > 0:
+                print(f"[HealthPersistence] Cleared notification cooldowns for {error_key}")
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing notification cooldown: {e}")
+        finally:
+            # Release the connection even when a statement failed.
+            if conn is not None:
+                conn.close()
+    
+    def _clear_disk_io_cooldown(self, error_key: str):
+        """
+        Clear disk I/O cooldowns from notification_last_sent when an error is dismissed.
+
+        This coordinates with BOTH:
+        1. JournalWatcher's 24h cooldown system (prefixes: diskio_, fs_, and
+           the bare device name)
+        2. PollingCollector's 24h cooldown system (prefix: health_)
+
+        The device name is extracted from ``error_key`` (e.g. disk_fs_sdh1,
+        disk_smart_sda, disk_io_error_nvme0n1, smart_sdh, or just "sdh"),
+        reduced to its whole-disk name, and every fingerprint containing that
+        whole-disk name is deleted. This covers fingerprints such as:
+        - sdh, diskio_sdh, fs_sdh1
+        - health_disk_smart_sdh, health_disk_io_error_sdh, health_disk_fs_sdh1
+
+        Args:
+            error_key: Key of the dismissed disk error. If no device name can
+                be extracted from it, nothing is deleted.
+
+        Never raises: database failures are printed and swallowed (best effort).
+        """
+        import re
+
+        if not error_key:
+            return
+
+        # Common patterns: disk_fs_sdh, disk_smart_sda, disk_io_error_sdh, smart_sdh
+        device_match = re.search(r'(?:disk_fs_|disk_smart_|disk_io_error_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
+        if not device_match:
+            # The error_key might just be the device name itself.
+            device_match = re.match(r'^([a-z]{2,4}[a-z0-9]*)$', error_key)
+            if not device_match:
+                return
+        device = device_match.group(1)
+
+        # Reduce a partition to its whole disk. Only strip a real partition
+        # suffix: blindly removing trailing digits would turn nvme0n1 into
+        # "nvme0n" (matching every namespace) or md0 into "md".
+        partition_match = (
+            re.match(r'^(nvme\d+n\d+|mmcblk\d+)p\d+$', device)      # nvme0n1p2 -> nvme0n1
+            or re.match(r'^((?:sd|hd|vd|xvd)[a-z]+)\d+$', device)   # sdh1 -> sdh
+        )
+        base_device = partition_match.group(1) if partition_match else device
+
+        conn = None
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+
+            # base_device is always a substring of device, and every known
+            # fingerprint format embeds the device name, so one substring
+            # match covers the exact, prefixed and partition-level variants.
+            # Device names are [a-z0-9] only, so no LIKE escaping is needed.
+            cursor.execute(
+                'DELETE FROM notification_last_sent WHERE fingerprint LIKE ?',
+                (f'%{base_device}%',)
+            )
+
+            conn.commit()
+            print(f"[HealthPersistence] Cleared disk I/O cooldowns for {error_key} (device: {device})")
+        except Exception as e:
+            print(f"[HealthPersistence] Error clearing disk I/O cooldown: {e}")
+        finally:
+            # Release the connection even when a statement failed.
+            if conn is not None:
+                conn.close()
+
+
 # Global instance
 health_persistence = HealthPersistence()
@@ -319,6 +319,41 @@ class JournalWatcher:
        except Exception as e:
            print(f"[JournalWatcher] Failed to save disk_io_notified: {e}")
    
+    def _get_disk_io_cooldown_from_db(self, device: str, db_path: Optional[Path] = None) -> Optional[float]:
+        """
+        Get the disk I/O cooldown timestamp stored in the DB for a fingerprint.
+
+        Used to re-check the DB when the in-memory cache says a cooldown is
+        active but the user might have dismissed the error, which deletes the
+        row from notification_last_sent.
+
+        Args:
+            device: Fingerprint to look up, matched exactly (callers pass the
+                same dedup key they use for the in-memory cache).
+            db_path: Optional database location; defaults to the health
+                monitor DB at /usr/local/share/proxmenux/health_monitor.db.
+
+        Returns:
+            The stored timestamp if a row exists and is still inside the
+            ``_DISK_IO_COOLDOWN`` window; None if the DB or row is missing,
+            the entry has expired, or any error occurs.
+        """
+        path = Path(db_path) if db_path is not None else Path('/usr/local/share/proxmenux/health_monitor.db')
+        conn = None
+        try:
+            if not path.exists():
+                return None
+            conn = sqlite3.connect(str(path), timeout=5)
+            conn.execute('PRAGMA busy_timeout=3000')
+            cursor = conn.cursor()
+
+            # Exact fingerprint match only; no prefix variants are tried here.
+            cursor.execute(
+                "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
+                (device,)
+            )
+            row = cursor.fetchone()
+            if row is None or row[0] is None:
+                return None
+
+            ts = float(row[0])
+            # Only report entries that are still inside the cooldown window.
+            if time.time() - ts < self._DISK_IO_COOLDOWN:
+                return ts
+            return None
+        except Exception:
+            # Best effort: any DB problem is reported as "no cooldown found".
+            return None
+        finally:
+            # Close the connection even when the query raised.
+            if conn is not None:
+                conn.close()
+    
    def stop(self):
        """Stop the journal watcher."""
        self._running = False
@@ -589,7 +624,14 @@ class JournalWatcher:
                        fs_dedup_key = f'fs_{device}'
                    last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
                    if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
-                        return  # Already notified for this device recently
+                        # In-memory says cooldown active. Re-check DB in case
+                        # user dismissed the error (which clears DB cooldowns).
+                        db_ts = self._get_disk_io_cooldown_from_db(fs_dedup_key)
+                        if db_ts is not None and now_fs - db_ts < self._DISK_IO_COOLDOWN:
+                            return  # DB confirms cooldown is still active
+                        # DB says cooldown was cleared - proceed
+                        if fs_dedup_key in self._disk_io_notified:
+                            del self._disk_io_notified[fs_dedup_key]
                    
                    # ── Device existence gating ──
                    device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
@@ -842,10 +884,24 @@ class JournalWatcher:
                return
            
            # ── Gate 2: 24-hour dedup per device ──
+            # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
+            # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
+            # entry, so we should refresh from DB to get the real state.
            now = time.time()
+            
+            # First check in-memory cache
            last_notified = self._disk_io_notified.get(resolved, 0)
+            
            if now - last_notified < self._DISK_IO_COOLDOWN:
-                return  # Already notified for this disk recently
+                # In-memory says we already notified. But user might have dismissed
+                # the error, which clears the DB. Re-check DB to be sure.
+                db_ts = self._get_disk_io_cooldown_from_db(resolved)
+                if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
+                    return  # DB confirms cooldown is still active
+                # DB says cooldown was cleared (user dismissed) - proceed to notify
+                # Update in-memory cache
+                del self._disk_io_notified[resolved]
+            
            self._disk_io_notified[resolved] = now
            self._save_disk_io_notified(resolved, now)
            
@@ -2069,8 +2125,16 @@ class PollingCollector:
            # ── SAME ERROR COOLDOWN (24h) ──
            # The SAME error_key cannot be re-notified before 24 hours.
            # This is the PRIMARY deduplication mechanism.
+            # EXCEPTION: If user dismissed the error, the cooldown is cleared in DB
+            # and we should re-check DB to see if cooldown still applies.
            if time_since_last < self.SAME_ERROR_COOLDOWN:
-                continue
+                # Check if user dismissed this - clears DB cooldown
+                db_ts = self._get_cooldown_from_db(error_key)
+                if db_ts is not None and now - db_ts < self.SAME_ERROR_COOLDOWN:
+                    continue  # DB confirms cooldown still active
+                # DB says cooldown was cleared (user dismissed) - remove from memory
+                self._last_notified.pop(error_key, None)
+                # Continue to the next checks (category cooldown etc.)
            
            # ── CATEGORY COOLDOWN (varies) ──
            # DIFFERENT errors within the same category respect category cooldown.
@@ -2735,6 +2799,41 @@ class PollingCollector:
            conn.close()
        except Exception:
            pass
+    
+    def _get_cooldown_from_db(self, error_key: str, db_path: Optional[Path] = None) -> Optional[float]:
+        """
+        Get the cooldown timestamp stored in the DB for an error_key.
+
+        Used to re-check the DB when the in-memory cache says a cooldown is
+        active but the user might have dismissed the error. Dismissal deletes
+        the row from notification_last_sent, for disk and non-disk
+        categories alike.
+
+        Args:
+            error_key: Health error key; looked up under the fingerprint
+                ``health_<error_key>``.
+            db_path: Optional database location; defaults to the health
+                monitor DB at /usr/local/share/proxmenux/health_monitor.db.
+
+        Returns:
+            The stored timestamp if a row exists and is still inside the
+            ``SAME_ERROR_COOLDOWN`` window; None if the DB or row is missing,
+            the entry has expired, or any error occurs.
+        """
+        path = Path(db_path) if db_path is not None else Path('/usr/local/share/proxmenux/health_monitor.db')
+        conn = None
+        try:
+            if not path.exists():
+                return None
+            conn = sqlite3.connect(str(path), timeout=5)
+            conn.execute('PRAGMA busy_timeout=3000')
+            cursor = conn.cursor()
+
+            # PollingCollector uses 'health_' prefix for its fingerprints
+            fp = f'health_{error_key}'
+            cursor.execute(
+                "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
+                (fp,)
+            )
+            row = cursor.fetchone()
+            if row is None or row[0] is None:
+                return None
+
+            ts = float(row[0])
+            # Only report entries that are still inside the cooldown window.
+            if time.time() - ts < self.SAME_ERROR_COOLDOWN:
+                return ts
+            return None
+        except Exception:
+            # Best effort: any DB problem is reported as "no cooldown found".
+            return None
+        finally:
+            # Close the connection even when the query raised.
+            if conn is not None:
+                conn.close()


 # ─── Proxmox Webhook Receiver ───────────────────────────────────