diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py
index e90b516e..257f2300 100644
--- a/AppImage/scripts/health_monitor.py
+++ b/AppImage/scripts/health_monitor.py
@@ -2611,6 +2611,28 @@ class HealthMonitor:
                 continue
         return ''
 
+    def _vm_ct_exists(self, vmid: str) -> bool:
+        """Check if a VM or CT exists by verifying its config file."""
+        import os
+        # Check VM config
+        vm_conf = f'/etc/pve/qemu-server/{vmid}.conf'
+        if os.path.exists(vm_conf):
+            return True
+        # Check CT config (local node and cluster nodes)
+        for base in ['/etc/pve/lxc', '/etc/pve/nodes']:
+            if base == '/etc/pve/lxc':
+                ct_conf = f'{base}/{vmid}.conf'
+                if os.path.exists(ct_conf):
+                    return True
+            else:
+                # Check all cluster nodes
+                if os.path.isdir(base):
+                    for node in os.listdir(base):
+                        ct_conf = f'{base}/{node}/lxc/{vmid}.conf'
+                        if os.path.exists(ct_conf):
+                            return True
+        return False
+
     def _check_vms_cts_optimized(self) -> Dict[str, Any]:
         """
         Optimized VM/CT check - detects qmp failures and startup errors from logs.
@@ -2648,6 +2670,9 @@ class HealthMonitor:
                         if _vzdump_running:
                             continue  # Normal during backup
                         vmid = vm_qmp_match.group(1)
+                        # Skip if VM no longer exists (stale journal entry)
+                        if not self._vm_ct_exists(vmid):
+                            continue
                         vm_name = self._resolve_vm_name(vmid)
                         display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}"
                         key = f'vm_{vmid}'
@@ -2665,6 +2690,9 @@ class HealthMonitor:
                 ct_error_match = re.search(r'(?:ct|container|lxc)\s+(\d+)', line_lower)
                 if ct_error_match and ('error' in line_lower or 'fail' in line_lower or 'device' in line_lower):
                     ctid = ct_error_match.group(1)
+                    # Skip if CT no longer exists (stale journal entry)
+                    if not self._vm_ct_exists(ctid):
+                        continue
                     key = f'ct_{ctid}'
                     if key not in vm_details:
                         if 'device' in line_lower and 'does not exist' in line_lower:
@@ -2694,6 +2722,9 @@ class HealthMonitor:
                 vzstart_match = re.search(r'vzstart:(\d+):', line)
                 if vzstart_match and ('error' in line_lower or 'fail' in line_lower or 'does not exist' in line_lower):
                     ctid = vzstart_match.group(1)
+                    # Skip if CT no longer exists (stale journal entry)
+                    if not self._vm_ct_exists(ctid):
+                        continue
                     key = f'ct_{ctid}'
                     if key not in vm_details:
                         # Resolve CT name for better context
@@ -2726,6 +2757,9 @@ class HealthMonitor:
                     id_match = re.search(r'\b(\d{3,4})\b', line)
                     if id_match:
                         vmid = id_match.group(1)
+                        # Skip if VM/CT no longer exists (stale journal entry)
+                        if not self._vm_ct_exists(vmid):
+                            continue
                         key = f'vmct_{vmid}'
                         if key not in vm_details:
                             vm_name = self._resolve_vm_name(vmid)
diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py
index 132b9c2d..b82326b1 100644
--- a/AppImage/scripts/health_persistence.py
+++ b/AppImage/scripts/health_persistence.py
@@ -80,8 +80,14 @@ class HealthPersistence:
 
     def _init_database(self):
         """Initialize SQLite database with required tables"""
-        conn = self._get_conn()
-        cursor = conn.cursor()
+        try:
+            conn = self._get_conn()
+            cursor = conn.cursor()
+        except Exception as e:
+            print(f"[HealthPersistence] CRITICAL: Failed to connect to database: {e}")
+            return
+
+        print(f"[HealthPersistence] Initializing database at {self.db_path}")
 
         # Errors table
         cursor.execute('''
@@ -271,6 +277,20 @@ class HealthPersistence:
         cursor.execute('CREATE INDEX IF NOT EXISTS idx_excluded_interface ON excluded_interfaces(interface_name)')
 
         conn.commit()
+
+        # Verify all required tables exist
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+        tables = {row[0] for row in cursor.fetchall()}
+        required_tables = {'errors', 'events', 'system_capabilities', 'user_settings',
+                           'notification_history', 'notification_last_sent',
+                           'disk_registry', 'disk_observations',
+                           'excluded_storages', 'excluded_interfaces'}
+        missing = required_tables - tables
+        if missing:
+            print(f"[HealthPersistence] WARNING: Missing tables after init: {missing}")
+        else:
+            print(f"[HealthPersistence] Database initialized with {len(tables)} tables")
+
         conn.close()
 
     def record_error(self, error_key: str, category: str, severity: str,
@@ -283,6 +303,32 @@ class HealthPersistence:
         return self._record_error_impl(error_key, category, severity, reason, details)
 
     def _record_error_impl(self, error_key, category, severity, reason, details):
+        # === RESOURCE EXISTENCE CHECK ===
+        # Skip recording errors for resources that no longer exist
+        # This prevents "ghost" errors from stale journal entries
+
+        # Check VM/CT existence
+        if error_key and (error_key.startswith(('vm_', 'ct_', 'vmct_'))):
+            import re
+            vmid_match = re.search(r'(?:vm_|ct_|vmct_)(\d+)', error_key)
+            if vmid_match:
+                vmid = vmid_match.group(1)
+                if not self._check_vm_ct_exists(vmid):
+                    return {'type': 'skipped', 'needs_notification': False,
+                            'reason': f'VM/CT {vmid} no longer exists'}
+
+        # Check disk existence
+        if error_key and any(error_key.startswith(p) for p in ('smart_', 'disk_', 'io_error_')):
+            import re
+            import os
+            disk_match = re.search(r'(?:smart_|disk_fs_|disk_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
+            if disk_match:
+                disk_name = disk_match.group(1)
+                base_disk = re.sub(r'\d+$', '', disk_name) if disk_name[-1].isdigit() else disk_name
+                if not os.path.exists(f'/dev/{disk_name}') and not os.path.exists(f'/dev/{base_disk}'):
+                    return {'type': 'skipped', 'needs_notification': False,
+                            'reason': f'Disk /dev/{disk_name} no longer exists'}
+
         conn = self._get_conn()
         cursor = conn.cursor()
 
@@ -1030,8 +1076,8 @@ class HealthPersistence:
         last_seen_hours = get_age_hours(last_seen)
 
         # === VM/CT ERRORS ===
-        # Check if VM/CT still exists (covers: vms category, vm_*, ct_* error keys)
-        if category == 'vms' or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_'))):
+        # Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
+        if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
             vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason)
             if vmid and not check_vm_ct_cached(vmid):
                 should_resolve = True
@@ -1050,13 +1096,17 @@ class HealthPersistence:
                 should_resolve = True
                 resolution_reason = 'ZFS pool removed'
 
-        # Check for disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing")
+        # Check for disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing", "disk_fs_sdb1")
         if not should_resolve:
-            disk_match = re.search(r'(?:disk_|smart_|io_error_)([a-z]{2,4}\d*)', error_key)
+            # Match patterns like: smart_sda, disk_sdb, io_error_nvme0n1, disk_fs_sdb1
+            disk_match = re.search(r'(?:disk_fs_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
             if disk_match:
                 disk_name = disk_match.group(1)
+                # Remove partition number for base device check
+                base_disk = re.sub(r'\d+$', '', disk_name) if disk_name[-1].isdigit() else disk_name
                 disk_path = f'/dev/{disk_name}'
-                if not os.path.exists(disk_path):
+                base_path = f'/dev/{base_disk}'
+                if not os.path.exists(disk_path) and not os.path.exists(base_path):
                     should_resolve = True
                     resolution_reason = 'Disk device removed'
 
@@ -1113,7 +1163,13 @@ class HealthPersistence:
 
         # === CLUSTER ERRORS ===
         # Resolve cluster/corosync/qdevice errors if node is no longer in a cluster
-        elif error_key and any(x in error_key.lower() for x in ('cluster', 'corosync', 'qdevice', 'quorum')):
+        # Check both error_key and reason for cluster-related keywords
+        cluster_keywords = ('cluster', 'corosync', 'qdevice', 'quorum', 'cman', 'pacemaker')
+        is_cluster_error = (
+            (error_key and any(x in error_key.lower() for x in cluster_keywords)) or
+            (reason and any(x in reason.lower() for x in cluster_keywords))
+        )
+        if is_cluster_error:
             cluster_info = get_cluster_status()
             if not cluster_info['is_cluster']:
                 should_resolve = True