diff --git a/AppImage/scripts/flask_health_routes.py b/AppImage/scripts/flask_health_routes.py index 3cc5df28..117319a1 100644 --- a/AppImage/scripts/flask_health_routes.py +++ b/AppImage/scripts/flask_health_routes.py @@ -458,7 +458,7 @@ def delete_storage_exclusion(storage_name): return jsonify({'error': str(e)}), 500 -# ═══════════════════════════════════════════════════════════════════════════ +# ══════════════════════════════════════════════════════════════════════════ # NETWORK INTERFACE EXCLUSION ROUTES # ═══════════════════════════════════════════════════════════════════════════ diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 93701343..3b20a12b 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -228,7 +228,6 @@ class HealthMonitor: def __init__(self): """Initialize health monitor with state tracking""" - print("[HealthMonitor] Version 2026-03-31-v2 - Stale resource cleanup enabled") self.state_history = defaultdict(list) self.last_check_times = {} self.cached_results = {} @@ -1218,10 +1217,7 @@ class HealthMonitor: 'dismissable': True, } ) - # Update worst_health for the disk (persists even if current error clears) - # Use serial for proper USB disk tracking - health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower()) - # Also register the disk for observation tracking + # Register the disk for observation tracking (worst_health no longer used) if disk_serial: health_persistence.register_disk(device, disk_serial, disk_model, 0) except Exception: @@ -1242,7 +1238,7 @@ class HealthMonitor: if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK': issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}') storage_details[disk_path] = disk_info - # Update worst_health for I/O errors + device = disk_path.replace('/dev/', '') io_severity = disk_info.get('status', 'WARNING').lower() @@ -1262,8 +1258,8 @@ class HealthMonitor: except Exception: pass + # Register the disk for observation tracking (worst_health no longer used) try: - health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity) if io_serial: health_persistence.register_disk(device, io_serial, io_model, 0) except Exception: @@ -1459,24 +1455,10 @@ class HealthMonitor: serial = disk_info.get('serial', '') model = disk_info.get('model', '') - # Get worst_health from persistence + # Use current status directly from Proxmox/SMART - no persistent worst_health + # Historical observations are preserved separately in disk_observations table current_status = error_info.get('status', 'WARNING') - try: - health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None) - worst_health = health_status.get('worst_health', 'healthy') - - # Final health = max(current, worst) - health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2} - current_level = health_order.get(current_status.lower(), 1) - worst_level = health_order.get(worst_health.lower(), 0) - - if worst_level > current_level: - # worst_health is worse, use it - final_status = worst_health.upper() - else: - final_status = current_status - except Exception: - final_status = current_status + final_status = current_status # Build detail string with serial/model if available detail = error_info.get('detail', error_info.get('reason', 'Unknown error')) diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 131c5aff..8b6b60d0 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -868,7 +868,6 @@ class HealthPersistence: return self._cleanup_old_errors_impl() def _cleanup_old_errors_impl(self): - print("[HealthPersistence] Running cleanup_old_errors...") conn = self._get_conn() cursor = conn.cursor() @@ -963,14 +962,13 @@ class HealthPersistence: now_iso = now.isoformat() # Get all active (unresolved) errors with first_seen and last_seen for age checks + # An error is considered unresolved if resolution_type is NULL or empty + # (resolved_at alone is not sufficient - it may be in an inconsistent state) cursor.execute(''' SELECT id, error_key, category, reason, first_seen, last_seen, severity FROM errors - WHERE resolved_at IS NULL + WHERE resolution_type IS NULL OR resolution_type = '' ''') active_errors = cursor.fetchall() - - print(f"[HealthPersistence] _cleanup_stale_resources: Found {len(active_errors)} active errors to check") - resolved_count = 0 # Cache for expensive checks (avoid repeated subprocess calls) @@ -1086,13 +1084,9 @@ class HealthPersistence: # Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys) if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))): vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason) - print(f"[HealthPersistence] Checking VM/CT error: key={error_key}, category={category}, vmid={vmid}") - if vmid: - exists = check_vm_ct_cached(vmid) - print(f"[HealthPersistence] VM/CT {vmid} exists: {exists}") - if not exists: - should_resolve = True - resolution_reason = 'VM/CT deleted' + if vmid and not check_vm_ct_cached(vmid): + should_resolve = True + resolution_reason = 'VM/CT deleted' # === DISK ERRORS === # Check if disk device or ZFS pool still exists @@ -1207,7 +1201,6 @@ class HealthPersistence: resolution_reason = 'Stale error (no activity >7d)' if should_resolve: - print(f"[HealthPersistence] Resolving error: {error_key} - {resolution_reason}") cursor.execute(''' UPDATE errors SET resolved_at = ?, resolution_type = 'auto', resolution_reason = ? WHERE id = ? @@ -1862,130 +1855,10 @@ class HealthPersistence: pass return None - def update_disk_worst_health(self, device_name: str, serial: Optional[str], - new_health: str) -> bool: - """Update worst_health if new_health is worse than current. - - Health hierarchy: healthy < warning < critical - Only escalates, never downgrades automatically. - - Returns True if worst_health was updated. - """ - health_order = {'healthy': 0, 'warning': 1, 'critical': 2} - new_level = health_order.get(new_health.lower(), 0) - - if new_level == 0: # healthy never updates worst_health - return False - - now = datetime.now().isoformat() - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name, serial) - if not disk_id: - # Register disk first - self.register_disk(device_name.replace('/dev/', ''), serial) - disk_id = self._get_disk_registry_id(cursor, device_name, serial) - - if not disk_id: - conn.close() - return False - - # Get current worst_health - cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,)) - row = cursor.fetchone() - current_worst = row[0] if row and row[0] else 'healthy' - current_level = health_order.get(current_worst.lower(), 0) - - # Only update if new health is worse - if new_level > current_level: - cursor.execute(''' - UPDATE disk_registry - SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL - WHERE id = ? - ''', (new_health.lower(), now, disk_id)) - conn.commit() - conn.close() - return True - - conn.close() - return False - except Exception as e: - print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}") - return False - - def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]: - """Get the health status of a disk including worst_health. - - Returns dict with: - - worst_health: 'healthy', 'warning', or 'critical' - - worst_health_date: ISO timestamp when worst_health was set - - admin_cleared: ISO timestamp if admin manually cleared the health - - observations_count: Number of recorded observations - """ - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name, serial) - if not disk_id: - conn.close() - return {'worst_health': 'healthy', 'observations_count': 0} - - cursor.execute(''' - SELECT worst_health, worst_health_date, admin_cleared - FROM disk_registry WHERE id = ? - ''', (disk_id,)) - row = cursor.fetchone() - - # Count observations - cursor.execute( - 'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0', - (disk_id,)) - obs_count = cursor.fetchone()[0] - - conn.close() - - if row: - return { - 'worst_health': row[0] or 'healthy', - 'worst_health_date': row[1], - 'admin_cleared': row[2], - 'observations_count': obs_count - } - return {'worst_health': 'healthy', 'observations_count': obs_count} - except Exception as e: - print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}") - return {'worst_health': 'healthy', 'observations_count': 0} - - def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool: - """Admin action: clear worst_health back to healthy. - - This resets the health status but keeps all observations for audit. - Records when the admin cleared it for accountability. - """ - now = datetime.now().isoformat() - try: - conn = self._get_conn() - cursor = conn.cursor() - - disk_id = self._get_disk_registry_id(cursor, device_name, serial) - if not disk_id: - conn.close() - return False - - cursor.execute(''' - UPDATE disk_registry - SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ? - WHERE id = ? - ''', (now, disk_id)) - conn.commit() - conn.close() - return True - except Exception as e: - print(f"[HealthPersistence] Error clearing health for {device_name}: {e}") - return False + # NOTE: update_disk_worst_health, get_disk_health_status, clear_disk_health_history + # were removed. The disk health badge now shows the CURRENT status from Proxmox/SMART + # directly, not a persistent "worst_health". Historical observations are preserved + # in disk_observations table and shown separately via the "X obs." badge. def record_disk_observation(self, device_name: str, serial: Optional[str], error_type: str, error_signature: str, @@ -2025,9 +1898,7 @@ class HealthPersistence: conn.commit() conn.close() - - # Update worst_health based on observation severity - self.update_disk_worst_health(clean_dev, serial, severity) + # Observation recorded - worst_health no longer updated (badge shows current SMART status) except Exception as e: print(f"[HealthPersistence] Error recording disk observation: {e}") diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index a7929d19..755d3483 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -197,7 +197,7 @@ def capture_journal_context(keywords: list, lines: int = 30, return "" -# ─── Journal Watcher (Real-time) ───────────────────────────────── +# ─── Journal Watcher (Real-time) ──────────────────────────────── class JournalWatcher: """Watches journald in real-time for critical system events. @@ -964,10 +964,7 @@ class JournalWatcher: raw_message=raw_msg, severity='warning', ) - - # Update worst_health for permanent tracking (record_disk_observation - # already does this, but we ensure it here for safety) - health_persistence.update_disk_worst_health(base_dev, serial, 'warning') + # Observation recorded - worst_health no longer used (badge shows current SMART status) except Exception as e: print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}") diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index c246d17d..5119ae91 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -883,7 +883,7 @@ TEMPLATES = { 'default_enabled': True, }, - # ── ProxMenux updates ��─ + # ── ProxMenux updates ── 'proxmenux_update': { 'title': '{hostname}: ProxMenux {new_version} available', 'body': ( diff --git a/AppImage/scripts/startup_grace.py b/AppImage/scripts/startup_grace.py index eaf65ec2..33a92df8 100644 --- a/AppImage/scripts/startup_grace.py +++ b/AppImage/scripts/startup_grace.py @@ -21,7 +21,7 @@ import time import threading from typing import Set, List, Tuple, Optional -# ─── Configuration ───────────────────────���─────────────────────────────────── +# ─── Configuration ─────────────────────────────────────────────────────────── # Grace period durations (seconds) STARTUP_VM_GRACE_SECONDS = 180 # 3 minutes for VM/CT start aggregation