update health_persistence.py

This commit is contained in:
MacRimi
2026-04-01 12:03:54 +02:00
parent a734fa5566
commit d62396717a
6 changed files with 22 additions and 172 deletions

View File

@@ -458,7 +458,7 @@ def delete_storage_exclusion(storage_name):
return jsonify({'error': str(e)}), 500
# ══════════════════════════════════════════════════════════════════════════
# ══════════════════════════════════════════════════════════════════════════
# NETWORK INTERFACE EXCLUSION ROUTES
# ═══════════════════════════════════════════════════════════════════════════

View File

@@ -228,7 +228,6 @@ class HealthMonitor:
def __init__(self):
"""Initialize health monitor with state tracking"""
print("[HealthMonitor] Version 2026-03-31-v2 - Stale resource cleanup enabled")
self.state_history = defaultdict(list)
self.last_check_times = {}
self.cached_results = {}
@@ -1218,10 +1217,7 @@ class HealthMonitor:
'dismissable': True,
}
)
# Update worst_health for the disk (persists even if current error clears)
# Use serial for proper USB disk tracking
health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower())
# Also register the disk for observation tracking
# Register the disk for observation tracking (worst_health no longer used)
if disk_serial:
health_persistence.register_disk(device, disk_serial, disk_model, 0)
except Exception:
@@ -1242,7 +1238,7 @@ class HealthMonitor:
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
storage_details[disk_path] = disk_info
# Update worst_health for I/O errors
device = disk_path.replace('/dev/', '')
io_severity = disk_info.get('status', 'WARNING').lower()
@@ -1262,8 +1258,8 @@ class HealthMonitor:
except Exception:
pass
# Register the disk for observation tracking (worst_health no longer used)
try:
health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity)
if io_serial:
health_persistence.register_disk(device, io_serial, io_model, 0)
except Exception:
@@ -1459,24 +1455,10 @@ class HealthMonitor:
serial = disk_info.get('serial', '')
model = disk_info.get('model', '')
# Get worst_health from persistence
# Use current status directly from Proxmox/SMART - no persistent worst_health
# Historical observations are preserved separately in disk_observations table
current_status = error_info.get('status', 'WARNING')
try:
health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None)
worst_health = health_status.get('worst_health', 'healthy')
# Final health = max(current, worst)
health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2}
current_level = health_order.get(current_status.lower(), 1)
worst_level = health_order.get(worst_health.lower(), 0)
if worst_level > current_level:
# worst_health is worse, use it
final_status = worst_health.upper()
else:
final_status = current_status
except Exception:
final_status = current_status
final_status = current_status
# Build detail string with serial/model if available
detail = error_info.get('detail', error_info.get('reason', 'Unknown error'))

View File

@@ -868,7 +868,6 @@ class HealthPersistence:
return self._cleanup_old_errors_impl()
def _cleanup_old_errors_impl(self):
print("[HealthPersistence] Running cleanup_old_errors...")
conn = self._get_conn()
cursor = conn.cursor()
@@ -963,14 +962,13 @@ class HealthPersistence:
now_iso = now.isoformat()
# Get all active (unresolved) errors with first_seen and last_seen for age checks
# An error is considered unresolved if resolution_type is NULL or empty
# (resolved_at alone is not sufficient - it may be in an inconsistent state)
cursor.execute('''
SELECT id, error_key, category, reason, first_seen, last_seen, severity FROM errors
WHERE resolved_at IS NULL
WHERE resolution_type IS NULL OR resolution_type = ''
''')
active_errors = cursor.fetchall()
print(f"[HealthPersistence] _cleanup_stale_resources: Found {len(active_errors)} active errors to check")
resolved_count = 0
# Cache for expensive checks (avoid repeated subprocess calls)
@@ -1086,13 +1084,9 @@ class HealthPersistence:
# Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason)
print(f"[HealthPersistence] Checking VM/CT error: key={error_key}, category={category}, vmid={vmid}")
if vmid:
exists = check_vm_ct_cached(vmid)
print(f"[HealthPersistence] VM/CT {vmid} exists: {exists}")
if not exists:
should_resolve = True
resolution_reason = 'VM/CT deleted'
if vmid and not check_vm_ct_cached(vmid):
should_resolve = True
resolution_reason = 'VM/CT deleted'
# === DISK ERRORS ===
# Check if disk device or ZFS pool still exists
@@ -1207,7 +1201,6 @@ class HealthPersistence:
resolution_reason = 'Stale error (no activity >7d)'
if should_resolve:
print(f"[HealthPersistence] Resolving error: {error_key} - {resolution_reason}")
cursor.execute('''
UPDATE errors SET resolved_at = ?, resolution_type = 'auto', resolution_reason = ?
WHERE id = ?
@@ -1862,130 +1855,10 @@ class HealthPersistence:
pass
return None
def update_disk_worst_health(self, device_name: str, serial: Optional[str],
                             new_health: str) -> bool:
    """Update worst_health if new_health is worse than current.

    Health hierarchy: healthy < warning < critical.
    Only escalates, never downgrades automatically.

    Args:
        device_name: Device name (with or without a '/dev/' prefix).
        serial: Disk serial number when known; used for stable registry lookup.
        new_health: Candidate health level ('healthy', 'warning' or 'critical').

    Returns:
        True if worst_health was escalated, False otherwise (including on error).
    """
    health_order = {'healthy': 0, 'warning': 1, 'critical': 2}
    new_level = health_order.get(new_health.lower(), 0)
    if new_level == 0:  # 'healthy' (or an unknown level) never updates worst_health
        return False
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        disk_id = self._get_disk_registry_id(cursor, device_name, serial)
        if not disk_id:
            # Disk not in the registry yet - register it, then look it up again
            self.register_disk(device_name.replace('/dev/', ''), serial)
            disk_id = self._get_disk_registry_id(cursor, device_name, serial)
            if not disk_id:
                return False
        # Read current worst_health (missing row / NULL counts as 'healthy')
        cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
        row = cursor.fetchone()
        current_worst = row[0] if row and row[0] else 'healthy'
        current_level = health_order.get(current_worst.lower(), 0)
        # Only escalate; resetting admin_cleared re-arms the badge after a new event
        if new_level > current_level:
            cursor.execute('''
                UPDATE disk_registry
                SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL
                WHERE id = ?
            ''', (new_health.lower(), now, disk_id))
            conn.commit()
            return True
        return False
    except Exception as e:
        print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}")
        return False
    finally:
        # Always release the connection - the original leaked it whenever an
        # exception was raised between _get_conn() and the explicit close().
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass
def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]:
    """Get the health status of a disk including worst_health.

    Args:
        device_name: Device name (with or without a '/dev/' prefix).
        serial: Disk serial number when known; used for stable registry lookup.

    Returns:
        Dict with:
        - worst_health: 'healthy', 'warning' or 'critical'
        - worst_health_date: ISO timestamp when worst_health was set (when registered)
        - admin_cleared: ISO timestamp if an admin manually cleared the health
        - observations_count: number of non-dismissed recorded observations
        Falls back to {'worst_health': 'healthy', 'observations_count': 0} for
        unregistered disks or on any database error.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        disk_id = self._get_disk_registry_id(cursor, device_name, serial)
        if not disk_id:
            return {'worst_health': 'healthy', 'observations_count': 0}
        cursor.execute('''
            SELECT worst_health, worst_health_date, admin_cleared
            FROM disk_registry WHERE id = ?
        ''', (disk_id,))
        row = cursor.fetchone()
        # Count only observations the admin has not dismissed
        cursor.execute(
            'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0',
            (disk_id,))
        obs_count = cursor.fetchone()[0]
        if row:
            return {
                'worst_health': row[0] or 'healthy',
                'worst_health_date': row[1],
                'admin_cleared': row[2],
                'observations_count': obs_count
            }
        return {'worst_health': 'healthy', 'observations_count': obs_count}
    except Exception as e:
        print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}")
        return {'worst_health': 'healthy', 'observations_count': 0}
    finally:
        # Always release the connection - the original leaked it whenever an
        # exception was raised between _get_conn() and the explicit close().
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass
def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool:
    """Admin action: clear worst_health back to 'healthy'.

    Resets the health status but keeps all observations for audit; records
    the clear timestamp in admin_cleared for accountability.

    Args:
        device_name: Device name (with or without a '/dev/' prefix).
        serial: Disk serial number when known; used for stable registry lookup.

    Returns:
        True if the registry row was updated, False otherwise (including on error).
    """
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        disk_id = self._get_disk_registry_id(cursor, device_name, serial)
        if not disk_id:
            return False
        cursor.execute('''
            UPDATE disk_registry
            SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ?
            WHERE id = ?
        ''', (now, disk_id))
        conn.commit()
        return True
    except Exception as e:
        print(f"[HealthPersistence] Error clearing health for {device_name}: {e}")
        return False
    finally:
        # Always release the connection - the original leaked it whenever an
        # exception was raised between _get_conn() and the explicit close().
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass
# NOTE: update_disk_worst_health, get_disk_health_status, clear_disk_health_history
# were removed. The disk health badge now shows the CURRENT status from Proxmox/SMART
# directly, not a persistent "worst_health". Historical observations are preserved
# in disk_observations table and shown separately via the "X obs." badge.
def record_disk_observation(self, device_name: str, serial: Optional[str],
error_type: str, error_signature: str,
@@ -2025,9 +1898,7 @@ class HealthPersistence:
conn.commit()
conn.close()
# Update worst_health based on observation severity
self.update_disk_worst_health(clean_dev, serial, severity)
# Observation recorded - worst_health no longer updated (badge shows current SMART status)
except Exception as e:
print(f"[HealthPersistence] Error recording disk observation: {e}")

View File

@@ -197,7 +197,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
return ""
# ─── Journal Watcher (Real-time) ────────────────────────────────
# ─── Journal Watcher (Real-time) ────────────────────────────────
class JournalWatcher:
"""Watches journald in real-time for critical system events.
@@ -964,10 +964,7 @@ class JournalWatcher:
raw_message=raw_msg,
severity='warning',
)
# Update worst_health for permanent tracking (record_disk_observation
# already does this, but we ensure it here for safety)
health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
# Observation recorded - worst_health no longer used (badge shows current SMART status)
except Exception as e:
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")

View File

@@ -883,7 +883,7 @@ TEMPLATES = {
'default_enabled': True,
},
# ── ProxMenux updates ──
# ── ProxMenux updates
'proxmenux_update': {
'title': '{hostname}: ProxMenux {new_version} available',
'body': (

View File

@@ -21,7 +21,7 @@ import time
import threading
from typing import Set, List, Tuple, Optional
# ─── Configuration ──────────────────────────────────────────────────────────
# ─── Configuration ──────────────────────────────────────────────────────────
# Grace period durations (seconds)
STARTUP_VM_GRACE_SECONDS = 180 # 3 minutes for VM/CT start aggregation