update health_persistence.py

2026-04-23 20:10:39 +00:00 · 2026-04-06 12:02:05 +02:00
parent 5ead9ee661
commit adde2ce5b9
5 changed files with 245 additions and 104 deletions
@@ -174,8 +174,9 @@ def capture_journal_context(keywords: list, lines: int = 30,
            return ""
        
        # Use journalctl with grep to filter relevant lines
+        # Use -b 0 to only include logs from the current boot (not previous boots)
        cmd = (
-            f"journalctl --since='{since}' --no-pager -n 500 2>/dev/null | "
+            f"journalctl -b 0 --since='{since}' --no-pager -n 500 2>/dev/null | "
            f"grep -iE '{pattern}' | tail -n {lines}"
        )
        
@@ -1800,6 +1801,8 @@ class PollingCollector:
    # Key = health_persistence category name
    # Value = minimum seconds between notifications for the same error_key
    _CATEGORY_COOLDOWNS = {
+        # Category cooldown: minimum time between repeat notifications of one error_key,
+        # selected by its category (tracked per error_key, not across different errors)
        'disks':        86400,   # 24h - I/O errors are persistent hardware issues
        'smart':        86400,   # 24h - SMART errors same as I/O
        'zfs':          86400,   # 24h - ZFS pool issues are persistent
@@ -1809,6 +1812,7 @@ class PollingCollector:
        'temperature':  3600,    # 1h  - temp can fluctuate near thresholds
        'logs':         3600,    # 1h  - repeated log patterns
        'vms':          1800,    # 30m - VM state oscillation
+        'vmct':         1800,    # 30m - VM/CT state oscillation
        'security':     3600,    # 1h  - auth failures tend to be bursty
        'cpu':          1800,    # 30m - CPU spikes can be transient
        'memory':       1800,    # 30m - memory pressure oscillation
@@ -1816,6 +1820,10 @@ class PollingCollector:
        'updates':      86400,   # 24h - update info doesn't change fast
    }
    
+    # Global cooldown: minimum time before the SAME error can be re-notified
+    # This is independent of category - same error_key cannot repeat before this time
+    SAME_ERROR_COOLDOWN = 86400  # 24 hours
+    
    _ENTITY_MAP = {
        'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
        'load': ('node', ''),
@@ -2032,15 +2040,20 @@ class PollingCollector:
            # Determine if we should notify
            is_new = error_key not in self._known_errors
            last_sent = self._last_notified.get(error_key, 0)
-            cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
-            is_due = (now - last_sent) >= cat_cooldown
+            time_since_last = now - last_sent
+            
+            # ── SAME ERROR COOLDOWN (24h) ──
+            # The SAME error_key cannot be re-notified before 24 hours.
+            # This is the PRIMARY deduplication mechanism.
+            if time_since_last < self.SAME_ERROR_COOLDOWN:
+                continue
+            
+            # ── CATEGORY COOLDOWN (varies) ──
+            # Compared against this error_key's own last-notified time, not other errors'.
+            # NOTE: after the SAME_ERROR_COOLDOWN check above, this only skips when cat_cooldown is larger.
+            cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
+            is_due = time_since_last >= cat_cooldown
            
-            # Anti-oscillation: even if "new" (resolved then reappeared),
-            # respect the per-category cooldown interval.  This prevents
-            # "semi-cascades" where the same root cause generates multiple
-            # slightly different notifications across health check cycles.
-            # Each category has its own appropriate cooldown (30m for network,
-            # 24h for disks, 1h for temperature, etc.).
            if not is_due:
                continue