diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index ef381192..2701019a 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -821,8 +821,20 @@ class HealthMonitor: issues = [] storage_details = {} - # Check disk usage and mount status first for critical mounts - critical_mounts = ['/'] + # Check disk usage and mount status for important mounts. + # We detect actual mountpoints dynamically rather than hard-coding. + critical_mounts = set() + critical_mounts.add('/') + try: + for part in psutil.disk_partitions(all=False): + mp = part.mountpoint + # Include standard system mounts and PVE storage + if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \ + mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'): + critical_mounts.add(mp) + except Exception: + pass + critical_mounts = sorted(critical_mounts) for mount_point in critical_mounts: try: @@ -857,9 +869,32 @@ class HealthMonitor: # Check filesystem usage only if not already flagged as critical if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK': fs_status = self._check_filesystem(mount_point) + error_key = f'disk_space_{mount_point}' if fs_status['status'] != 'OK': issues.append(f"{mount_point}: {fs_status['reason']}") storage_details[mount_point] = fs_status + # Record persistent error for notifications + usage = psutil.disk_usage(mount_point) + avail_gb = usage.free / (1024**3) + if avail_gb >= 1: + avail_str = f"{avail_gb:.1f} GiB" + else: + avail_str = f"{usage.free / (1024**2):.0f} MiB" + health_persistence.record_error( + error_key=error_key, + category='disk', + severity=fs_status['status'], + reason=f'{mount_point}: {fs_status["reason"]}', + details={ + 'mount': mount_point, + 'used': str(round(usage.percent, 1)), + 'available': avail_str, + 'dismissable': False, + } + ) + else: + # Space recovered -- clear any previous alert + health_persistence.clear_error(error_key) except Exception: pass # Silently skip if mountpoint check fails @@ -1871,7 +1906,8 @@ class HealthMonitor: self.persistent_log_patterns[pattern] = { 'count': 1, 'first_seen': current_time, - 'last_seen': current_time + 'last_seen': current_time, + 'sample': line.strip()[:200], # Original line for display } for line in previous_lines: @@ -1913,12 +1949,16 @@ class HealthMonitor: pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8] error_key = f'log_persistent_{pattern_hash}' if not health_persistence.is_error_active(error_key, category='logs'): + # Use the original sample line for the notification, + # not the normalized pattern (which has IDs replaced). + sample = data.get('sample', pattern) health_persistence.record_error( error_key=error_key, category='logs', severity='WARNING', - reason=f'Persistent error pattern detected: {pattern[:80]}', - details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']} + reason=f'Recurring error ({data["count"]}x): {sample[:150]}', + details={'pattern': pattern, 'sample': sample, + 'dismissable': True, 'occurrences': data['count']} ) patterns_to_remove = [ diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 7837121b..ddbe1733 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -249,6 +249,23 @@ class JournalWatcher: def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int): """Detect kernel panics, OOM, segfaults, hardware errors.""" + # Only process messages from kernel or systemd (not app-level logs) + if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''): + return + + # Filter out normal kernel messages that are NOT problems + _KERNEL_NOISE = [ + r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop) + r'vfio-pci\s+\S+:\s*resetting', + r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops + r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP + r'tap\d+i\d+:', # TAP interface events + r'vmbr\d+:.*port\s+\d+', # Bridge port events + ] + for noise in _KERNEL_NOISE: + if re.search(noise, msg, re.IGNORECASE): + return + critical_patterns = { r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'), r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'), @@ -318,6 +335,19 @@ class JournalWatcher: def _check_service_failure(self, msg: str, unit: str): """Detect critical service failures with enriched context.""" + # Filter out noise -- these are normal systemd transient units, + # not real service failures worth alerting about. + _NOISE_PATTERNS = [ + r'session-\d+\.scope', # SSH/login sessions + r'user@\d+\.service', # Per-user service managers + r'user-runtime-dir@\d+', # User runtime dirs + r'systemd-coredump@', # Coredump handlers (transient) + r'run-.*\.mount', # Transient mounts + ] + for noise in _NOISE_PATTERNS: + if re.search(noise, msg) or re.search(noise, unit): + return + service_patterns = [ r'Failed to start (.+)', r'Unit (\S+) (?:entered failed state|failed)', @@ -743,13 +773,16 @@ class PollingCollector: 'load': 'load_high', 'temperature': 'temp_high', 'disk': 'disk_space_low', - 'storage': 'disk_space_low', + 'storage': 'storage_unavailable', 'network': 'network_down', 'pve_services': 'service_fail', 'security': 'auth_fail', 'updates': 'update_available', 'zfs': 'disk_io_error', 'smart': 'disk_io_error', + 'disks': 'disk_io_error', + 'logs': 'system_problem', + 'vms': 'system_problem', } def __init__(self, event_queue: Queue, poll_interval: int = 60): diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 9c2c41d9..334291de 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: """Parse a PVE vzdump notification message into structured data. - PVE vzdump messages contain: - - A table: VMID Name Status Time Size Filename - - Totals: Total running time: Xs / Total size: X GiB - - Full logs per VM + Supports two formats: + 1. Local storage: table with columns VMID Name Status Time Size Filename + 2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)' + and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X' Returns dict with 'vms' list, 'total_time', 'total_size', or None. """ @@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: lines = message.split('\n') - # Find the table header line + # ── Strategy 1: classic table (local/NFS/CIFS storage) ── header_idx = -1 for i, line in enumerate(lines): if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE): @@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: break if header_idx >= 0: - # Parse column positions from header - header = lines[header_idx] - # Parse table rows after header for line in lines[header_idx + 1:]: stripped = line.strip() if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='): break - # Table row: VMID Name Status Time Size Filename - # Use regex to parse flexible whitespace columns m = re.match( r'\s*(\d+)\s+' # VMID r'(\S+)\s+' # Name @@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: 'status': m.group(3), 'time': m.group(4), 'size': m.group(5), - 'filename': m.group(6).split('/')[-1], # just filename + 'filename': m.group(6).split('/')[-1], }) - # Extract totals + # ── Strategy 2: log-style (PBS / Proxmox Backup Server) ── + # Parse from the full vzdump log lines. + # Look for patterns: + # "Starting Backup of VM NNN (lxc/qemu)" -> detect guest + # "CT Name: xxx" or "VM Name: xxx" -> guest name + # "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok + # "root.pxar: had to backup X of Y" -> size (CT) + # "transferred X in N seconds" -> size (QEMU) + # "creating ... archive 'ct/100/2026-..'" -> archive name for PBS + # "TASK ERROR:" or "ERROR:" -> status=error + if not vms: + current_vm: Optional[Dict[str, str]] = None + + for line in lines: + # Remove "INFO: " prefix that PVE adds + clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip()) + + # Start of a new VM backup + m_start = re.match( + r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean) + if m_start: + if current_vm: + vms.append(current_vm) + current_vm = { + 'vmid': m_start.group(1), + 'name': '', + 'status': 'ok', + 'time': '', + 'size': '', + 'filename': '', + 'type': m_start.group(2), + } + continue + + if current_vm: + # Guest name + m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean) + if m_name: + current_vm['name'] = m_name.group(1).strip() + continue + + # PBS archive path -> extract as filename + m_archive = re.search( + r"creating .+ archive '([^']+)'", clean) + if m_archive: + current_vm['filename'] = m_archive.group(1) + continue + + # Size for containers (pxar) + m_pxar = re.search( + r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean) + if m_pxar: + current_vm['size'] = m_pxar.group(1) + continue + + # Size for QEMU (transferred) + m_transfer = re.search( + r'transferred\s+([\d.]+\s+\S+)', clean) + if m_transfer: + current_vm['size'] = m_transfer.group(1) + continue + + # Finished -> duration + m_finish = re.match( + r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean) + if m_finish: + current_vm['time'] = m_finish.group(2) + current_vm['status'] = 'ok' + vms.append(current_vm) + current_vm = None + continue + + # Error + if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'): + if current_vm: + current_vm['status'] = 'error' + + # Don't forget the last VM if it wasn't finished + if current_vm: + vms.append(current_vm) + + # ── Extract totals ── for line in lines: m_time = re.search(r'Total running time:\s*(.+)', line) if m_time: @@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: if m_size: total_size = m_size.group(1).strip() + # For PBS: calculate total size if not explicitly stated + if not total_size and vms: + # Sum individual sizes if they share units + sizes_gib = 0.0 + for vm in vms: + s = vm.get('size', '') + m = re.match(r'([\d.]+)\s+(.*)', s) + if m: + val = float(m.group(1)) + unit = m.group(2).strip().upper() + if 'GIB' in unit or 'GB' in unit: + sizes_gib += val + elif 'MIB' in unit or 'MB' in unit: + sizes_gib += val / 1024 + elif 'TIB' in unit or 'TB' in unit: + sizes_gib += val * 1024 + if sizes_gib > 0: + if sizes_gib >= 1024: + total_size = f"{sizes_gib / 1024:.3f} TiB" + elif sizes_gib >= 1: + total_size = f"{sizes_gib:.3f} GiB" + else: + total_size = f"{sizes_gib * 1024:.3f} MiB" + + # For PBS: calculate total time if not stated + if not total_time and vms: + total_secs = 0 + for vm in vms: + t = vm.get('time', '') + # Parse HH:MM:SS format + m = re.match(r'(\d+):(\d+):(\d+)', t) + if m: + total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3)) + if total_secs > 0: + hours = total_secs // 3600 + mins = (total_secs % 3600) // 60 + secs = total_secs % 60 + if hours: + total_time = f"{hours}h {mins}m {secs}s" + elif mins: + total_time = f"{mins}m {secs}s" + else: + total_time = f"{secs}s" + if not vms and not total_size: return None @@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str: if vm.get('time'): details.append(f"Duration: {vm['time']}") if vm.get('filename'): - details.append(f"File: {vm['filename']}") + fname = vm['filename'] + # PBS archives look like "ct/100/2026-..." or "vm/105/2026-..." + if re.match(r'^(?:ct|vm)/\d+/', fname): + details.append(f"PBS: {fname}") + else: + details.append(f"File: {fname}") if details: parts.append(' | '.join(details)) parts.append('') # blank line between VMs @@ -338,6 +463,12 @@ TEMPLATES = { 'group': 'storage', 'default_enabled': True, }, + 'storage_unavailable': { + 'title': '{hostname}: Storage unavailable - {storage_name}', + 'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}', + 'group': 'storage', + 'default_enabled': True, + }, 'load_high': { 'title': '{hostname}: High system load ({value})', 'body': 'System load average: {value} on {cores} cores.\n{details}',