mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 04:50:38 +00:00
Update notification service
This commit is contained in:
@@ -821,8 +821,20 @@ class HealthMonitor:
|
||||
issues = []
|
||||
storage_details = {}
|
||||
|
||||
# Check disk usage and mount status first for critical mounts
|
||||
critical_mounts = ['/']
|
||||
# Check disk usage and mount status for important mounts.
|
||||
# We detect actual mountpoints dynamically rather than hard-coding.
|
||||
critical_mounts = set()
|
||||
critical_mounts.add('/')
|
||||
try:
|
||||
for part in psutil.disk_partitions(all=False):
|
||||
mp = part.mountpoint
|
||||
# Include standard system mounts and PVE storage
|
||||
if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \
|
||||
mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'):
|
||||
critical_mounts.add(mp)
|
||||
except Exception:
|
||||
pass
|
||||
critical_mounts = sorted(critical_mounts)
|
||||
|
||||
for mount_point in critical_mounts:
|
||||
try:
|
||||
@@ -857,9 +869,32 @@ class HealthMonitor:
|
||||
# Check filesystem usage only if not already flagged as critical
|
||||
if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK':
|
||||
fs_status = self._check_filesystem(mount_point)
|
||||
error_key = f'disk_space_{mount_point}'
|
||||
if fs_status['status'] != 'OK':
|
||||
issues.append(f"{mount_point}: {fs_status['reason']}")
|
||||
storage_details[mount_point] = fs_status
|
||||
# Record persistent error for notifications
|
||||
usage = psutil.disk_usage(mount_point)
|
||||
avail_gb = usage.free / (1024**3)
|
||||
if avail_gb >= 1:
|
||||
avail_str = f"{avail_gb:.1f} GiB"
|
||||
else:
|
||||
avail_str = f"{usage.free / (1024**2):.0f} MiB"
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='disk',
|
||||
severity=fs_status['status'],
|
||||
reason=f'{mount_point}: {fs_status["reason"]}',
|
||||
details={
|
||||
'mount': mount_point,
|
||||
'used': str(round(usage.percent, 1)),
|
||||
'available': avail_str,
|
||||
'dismissable': False,
|
||||
}
|
||||
)
|
||||
else:
|
||||
# Space recovered -- clear any previous alert
|
||||
health_persistence.clear_error(error_key)
|
||||
except Exception:
|
||||
pass # Silently skip if mountpoint check fails
|
||||
|
||||
@@ -1871,7 +1906,8 @@ class HealthMonitor:
|
||||
self.persistent_log_patterns[pattern] = {
|
||||
'count': 1,
|
||||
'first_seen': current_time,
|
||||
'last_seen': current_time
|
||||
'last_seen': current_time,
|
||||
'sample': line.strip()[:200], # Original line for display
|
||||
}
|
||||
|
||||
for line in previous_lines:
|
||||
@@ -1913,12 +1949,16 @@ class HealthMonitor:
|
||||
pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||
error_key = f'log_persistent_{pattern_hash}'
|
||||
if not health_persistence.is_error_active(error_key, category='logs'):
|
||||
# Use the original sample line for the notification,
|
||||
# not the normalized pattern (which has IDs replaced).
|
||||
sample = data.get('sample', pattern)
|
||||
health_persistence.record_error(
|
||||
error_key=error_key,
|
||||
category='logs',
|
||||
severity='WARNING',
|
||||
reason=f'Persistent error pattern detected: {pattern[:80]}',
|
||||
details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']}
|
||||
reason=f'Recurring error ({data["count"]}x): {sample[:150]}',
|
||||
details={'pattern': pattern, 'sample': sample,
|
||||
'dismissable': True, 'occurrences': data['count']}
|
||||
)
|
||||
|
||||
patterns_to_remove = [
|
||||
|
||||
@@ -249,6 +249,23 @@ class JournalWatcher:
|
||||
|
||||
def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int):
|
||||
"""Detect kernel panics, OOM, segfaults, hardware errors."""
|
||||
# Only process messages from kernel or systemd (not app-level logs)
|
||||
if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''):
|
||||
return
|
||||
|
||||
# Filter out normal kernel messages that are NOT problems
|
||||
_KERNEL_NOISE = [
|
||||
r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop)
|
||||
r'vfio-pci\s+\S+:\s*resetting',
|
||||
r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops
|
||||
r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP
|
||||
r'tap\d+i\d+:', # TAP interface events
|
||||
r'vmbr\d+:.*port\s+\d+', # Bridge port events
|
||||
]
|
||||
for noise in _KERNEL_NOISE:
|
||||
if re.search(noise, msg, re.IGNORECASE):
|
||||
return
|
||||
|
||||
critical_patterns = {
|
||||
r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'),
|
||||
r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'),
|
||||
@@ -318,6 +335,19 @@ class JournalWatcher:
|
||||
|
||||
def _check_service_failure(self, msg: str, unit: str):
|
||||
"""Detect critical service failures with enriched context."""
|
||||
# Filter out noise -- these are normal systemd transient units,
|
||||
# not real service failures worth alerting about.
|
||||
_NOISE_PATTERNS = [
|
||||
r'session-\d+\.scope', # SSH/login sessions
|
||||
r'user@\d+\.service', # Per-user service managers
|
||||
r'user-runtime-dir@\d+', # User runtime dirs
|
||||
r'systemd-coredump@', # Coredump handlers (transient)
|
||||
r'run-.*\.mount', # Transient mounts
|
||||
]
|
||||
for noise in _NOISE_PATTERNS:
|
||||
if re.search(noise, msg) or re.search(noise, unit):
|
||||
return
|
||||
|
||||
service_patterns = [
|
||||
r'Failed to start (.+)',
|
||||
r'Unit (\S+) (?:entered failed state|failed)',
|
||||
@@ -743,13 +773,16 @@ class PollingCollector:
|
||||
'load': 'load_high',
|
||||
'temperature': 'temp_high',
|
||||
'disk': 'disk_space_low',
|
||||
'storage': 'disk_space_low',
|
||||
'storage': 'storage_unavailable',
|
||||
'network': 'network_down',
|
||||
'pve_services': 'service_fail',
|
||||
'security': 'auth_fail',
|
||||
'updates': 'update_available',
|
||||
'zfs': 'disk_io_error',
|
||||
'smart': 'disk_io_error',
|
||||
'disks': 'disk_io_error',
|
||||
'logs': 'system_problem',
|
||||
'vms': 'system_problem',
|
||||
}
|
||||
|
||||
def __init__(self, event_queue: Queue, poll_interval: int = 60):
|
||||
|
||||
@@ -25,10 +25,10 @@ from typing import Dict, Any, Optional, List
|
||||
def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||
"""Parse a PVE vzdump notification message into structured data.
|
||||
|
||||
PVE vzdump messages contain:
|
||||
- A table: VMID Name Status Time Size Filename
|
||||
- Totals: Total running time: Xs / Total size: X GiB
|
||||
- Full logs per VM
|
||||
Supports two formats:
|
||||
1. Local storage: table with columns VMID Name Status Time Size Filename
|
||||
2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)'
|
||||
and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X'
|
||||
|
||||
Returns dict with 'vms' list, 'total_time', 'total_size', or None.
|
||||
"""
|
||||
@@ -41,7 +41,7 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
lines = message.split('\n')
|
||||
|
||||
# Find the table header line
|
||||
# ── Strategy 1: classic table (local/NFS/CIFS storage) ──
|
||||
header_idx = -1
|
||||
for i, line in enumerate(lines):
|
||||
if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE):
|
||||
@@ -49,15 +49,10 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||
break
|
||||
|
||||
if header_idx >= 0:
|
||||
# Parse column positions from header
|
||||
header = lines[header_idx]
|
||||
# Parse table rows after header
|
||||
for line in lines[header_idx + 1:]:
|
||||
stripped = line.strip()
|
||||
if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='):
|
||||
break
|
||||
# Table row: VMID Name Status Time Size Filename
|
||||
# Use regex to parse flexible whitespace columns
|
||||
m = re.match(
|
||||
r'\s*(\d+)\s+' # VMID
|
||||
r'(\S+)\s+' # Name
|
||||
@@ -74,10 +69,91 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||
'status': m.group(3),
|
||||
'time': m.group(4),
|
||||
'size': m.group(5),
|
||||
'filename': m.group(6).split('/')[-1], # just filename
|
||||
'filename': m.group(6).split('/')[-1],
|
||||
})
|
||||
|
||||
# Extract totals
|
||||
# ── Strategy 2: log-style (PBS / Proxmox Backup Server) ──
|
||||
# Parse from the full vzdump log lines.
|
||||
# Look for patterns:
|
||||
# "Starting Backup of VM NNN (lxc/qemu)" -> detect guest
|
||||
# "CT Name: xxx" or "VM Name: xxx" -> guest name
|
||||
# "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok
|
||||
# "root.pxar: had to backup X of Y" -> size (CT)
|
||||
# "transferred X in N seconds" -> size (QEMU)
|
||||
# "creating ... archive 'ct/100/2026-..'" -> archive name for PBS
|
||||
# "TASK ERROR:" or "ERROR:" -> status=error
|
||||
if not vms:
|
||||
current_vm: Optional[Dict[str, str]] = None
|
||||
|
||||
for line in lines:
|
||||
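# Strip the leading "INFO: ", "WARNING: " or "ERROR: " severity prefix that PVE adds.
# NOTE(review): because "ERROR:" is stripped here, the later
# clean.startswith('ERROR:') test no longer sees a plain "ERROR: ..." line;
# only 'TASK ERROR' lines keep their prefix -- confirm this is intended.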
|
||||
clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip())
|
||||
|
||||
# Start of a new VM backup
|
||||
m_start = re.match(
|
||||
r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean)
|
||||
if m_start:
|
||||
if current_vm:
|
||||
vms.append(current_vm)
|
||||
current_vm = {
|
||||
'vmid': m_start.group(1),
|
||||
'name': '',
|
||||
'status': 'ok',
|
||||
'time': '',
|
||||
'size': '',
|
||||
'filename': '',
|
||||
'type': m_start.group(2),
|
||||
}
|
||||
continue
|
||||
|
||||
if current_vm:
|
||||
# Guest name
|
||||
m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean)
|
||||
if m_name:
|
||||
current_vm['name'] = m_name.group(1).strip()
|
||||
continue
|
||||
|
||||
# PBS archive path -> extract as filename
|
||||
m_archive = re.search(
|
||||
r"creating .+ archive '([^']+)'", clean)
|
||||
if m_archive:
|
||||
current_vm['filename'] = m_archive.group(1)
|
||||
continue
|
||||
|
||||
# Size for containers (pxar)
|
||||
m_pxar = re.search(
|
||||
r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean)
|
||||
if m_pxar:
|
||||
current_vm['size'] = m_pxar.group(1)
|
||||
continue
|
||||
|
||||
# Size for QEMU (transferred)
|
||||
m_transfer = re.search(
|
||||
r'transferred\s+([\d.]+\s+\S+)', clean)
|
||||
if m_transfer:
|
||||
current_vm['size'] = m_transfer.group(1)
|
||||
continue
|
||||
|
||||
# Finished -> duration
|
||||
m_finish = re.match(
|
||||
r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean)
|
||||
if m_finish:
|
||||
current_vm['time'] = m_finish.group(2)
|
||||
current_vm['status'] = 'ok'
|
||||
vms.append(current_vm)
|
||||
current_vm = None
|
||||
continue
|
||||
|
||||
# Error
|
||||
if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'):
|
||||
if current_vm:
|
||||
current_vm['status'] = 'error'
|
||||
|
||||
# Don't forget the last VM if it wasn't finished
|
||||
if current_vm:
|
||||
vms.append(current_vm)
|
||||
|
||||
# ── Extract totals ──
|
||||
for line in lines:
|
||||
m_time = re.search(r'Total running time:\s*(.+)', line)
|
||||
if m_time:
|
||||
@@ -86,6 +162,50 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
|
||||
if m_size:
|
||||
total_size = m_size.group(1).strip()
|
||||
|
||||
# For PBS: calculate total size if not explicitly stated
|
||||
if not total_size and vms:
|
||||
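# Sum the per-VM sizes, normalizing MiB/MB, GiB/GB and TiB/TB values to GiB;
# sizes with any other unit (or no parsable number) are left out of the total.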
|
||||
sizes_gib = 0.0
|
||||
for vm in vms:
|
||||
s = vm.get('size', '')
|
||||
m = re.match(r'([\d.]+)\s+(.*)', s)
|
||||
if m:
|
||||
val = float(m.group(1))
|
||||
unit = m.group(2).strip().upper()
|
||||
if 'GIB' in unit or 'GB' in unit:
|
||||
sizes_gib += val
|
||||
elif 'MIB' in unit or 'MB' in unit:
|
||||
sizes_gib += val / 1024
|
||||
elif 'TIB' in unit or 'TB' in unit:
|
||||
sizes_gib += val * 1024
|
||||
if sizes_gib > 0:
|
||||
if sizes_gib >= 1024:
|
||||
total_size = f"{sizes_gib / 1024:.3f} TiB"
|
||||
elif sizes_gib >= 1:
|
||||
total_size = f"{sizes_gib:.3f} GiB"
|
||||
else:
|
||||
total_size = f"{sizes_gib * 1024:.3f} MiB"
|
||||
|
||||
# For PBS: calculate total time if not stated
|
||||
if not total_time and vms:
|
||||
total_secs = 0
|
||||
for vm in vms:
|
||||
t = vm.get('time', '')
|
||||
# Parse HH:MM:SS format
|
||||
m = re.match(r'(\d+):(\d+):(\d+)', t)
|
||||
if m:
|
||||
total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
|
||||
if total_secs > 0:
|
||||
hours = total_secs // 3600
|
||||
mins = (total_secs % 3600) // 60
|
||||
secs = total_secs % 60
|
||||
if hours:
|
||||
total_time = f"{hours}h {mins}m {secs}s"
|
||||
elif mins:
|
||||
total_time = f"{mins}m {secs}s"
|
||||
else:
|
||||
total_time = f"{secs}s"
|
||||
|
||||
if not vms and not total_size:
|
||||
return None
|
||||
|
||||
@@ -113,7 +233,12 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
|
||||
if vm.get('time'):
|
||||
details.append(f"Duration: {vm['time']}")
|
||||
if vm.get('filename'):
|
||||
details.append(f"File: {vm['filename']}")
|
||||
fname = vm['filename']
|
||||
# PBS archives look like "ct/100/2026-..." or "vm/105/2026-..."
|
||||
if re.match(r'^(?:ct|vm)/\d+/', fname):
|
||||
details.append(f"PBS: {fname}")
|
||||
else:
|
||||
details.append(f"File: {fname}")
|
||||
if details:
|
||||
parts.append(' | '.join(details))
|
||||
parts.append('') # blank line between VMs
|
||||
@@ -338,6 +463,12 @@ TEMPLATES = {
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'storage_unavailable': {
|
||||
'title': '{hostname}: Storage unavailable - {storage_name}',
|
||||
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'load_high': {
|
||||
'title': '{hostname}: High system load ({value})',
|
||||
'body': 'System load average: {value} on {cores} cores.\n{details}',
|
||||
|
||||
Reference in New Issue
Block a user