diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 2701019a..ad2f5fa7 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -1386,6 +1386,32 @@ class HealthMonitor: except Exception: return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} + def _is_vzdump_active(self) -> bool: + """Check if a vzdump (backup) job is currently running.""" + try: + with open('/var/log/pve/tasks/active', 'r') as f: + for line in f: + if ':vzdump:' in line: + return True + except (OSError, IOError): + pass + return False + + def _resolve_vm_name(self, vmid: str) -> str: + """Resolve VMID to guest name from PVE config files.""" + if not vmid: + return '' + for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']: + conf = os.path.join(base, f'{vmid}.conf') + try: + with open(conf) as f: + for line in f: + if line.startswith('hostname:') or line.startswith('name:'): + return line.split(':', 1)[1].strip() + except (OSError, IOError): + continue + return '' + def _check_vms_cts_optimized(self) -> Dict[str, Any]: """ Optimized VM/CT check - detects qmp failures and startup errors from logs. @@ -1402,20 +1428,28 @@ class HealthMonitor: timeout=3 ) + # Check if vzdump is running -- QMP timeouts during backup are normal + _vzdump_running = self._is_vzdump_active() + if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: + if _vzdump_running: + continue # Normal during backup vmid = vm_qmp_match.group(1) + vm_name = self._resolve_vm_name(vmid) + display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" key = f'vm_{vmid}' if key not in vm_details: - issues.append(f'VM {vmid}: Communication issue') + issues.append(f'{display}: QMP communication issue') vm_details[key] = { 'status': 'WARNING', - 'reason': 'QMP command timeout', + 'reason': f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', 'id': vmid, + 'vmname': vm_name, 'type': 'VM' } continue @@ -1539,29 +1573,35 @@ class HealthMonitor: timeout=3 ) + _vzdump_running = self._is_vzdump_active() + if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - # VM QMP errors + # VM QMP errors (skip during active backup -- normal behavior) vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: + if _vzdump_running: + continue # Normal during backup vmid = vm_qmp_match.group(1) + vm_name = self._resolve_vm_name(vmid) + display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" error_key = f'vm_{vmid}' if error_key not in vm_details: - # Record persistent error health_persistence.record_error( error_key=error_key, category='vms', severity='WARNING', - reason='QMP command timeout', - details={'id': vmid, 'type': 'VM'} + reason=f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', + details={'id': vmid, 'vmname': vm_name, 'type': 'VM'} ) - issues.append(f'VM {vmid}: Communication issue') + issues.append(f'{display}: QMP communication issue') vm_details[error_key] = { 'status': 'WARNING', - 'reason': 'QMP command timeout', + 'reason': f'{display}: QMP command failed or timed out', 'id': vmid, + 'vmname': vm_name, 'type': 'VM' } continue diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 4a0b40bf..4adb63a2 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -598,11 +598,17 @@ class TaskWatcher: """Check if a vzdump (backup) job is currently running. Reads /var/log/pve/tasks/active which lists all running PVE tasks. + Also verifies the process is actually alive (PID check). Result is cached for a few seconds to avoid excessive file reads. """ now = time.time() + # Negative cache: if we recently confirmed NO vzdump, skip the check + if hasattr(self, '_vzdump_negative_cache') and \ + now - self._vzdump_negative_cache < self._vzdump_cache_ttl: + return False + # Positive cache if now - self._vzdump_active_cache < self._vzdump_cache_ttl: - return True # Recently confirmed active + return True active_file = '/var/log/pve/tasks/active' try: @@ -610,11 +616,20 @@ class TaskWatcher: for line in f: # UPID format: UPID:node:pid:pstart:starttime:type:id:user: if ':vzdump:' in line: - self._vzdump_active_cache = now - return True + # Verify the PID is still alive + parts = line.strip().split(':') + if len(parts) >= 3: + try: + pid = int(parts[2]) + os.kill(pid, 0) # Signal 0 = just check existence + self._vzdump_active_cache = now + return True + except (ValueError, ProcessLookupError, PermissionError): + pass # PID not found or not a number -- stale entry except (OSError, IOError): pass + self._vzdump_negative_cache = now return False def _watch_loop(self):