mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-25 04:50:38 +00:00
Update notification service
This commit is contained in:
@@ -280,7 +280,7 @@ class JournalWatcher:
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _check_service_failure(self, msg: str, unit: str):
|
def _check_service_failure(self, msg: str, unit: str):
|
||||||
"""Detect critical service failures."""
|
"""Detect critical service failures with enriched context."""
|
||||||
service_patterns = [
|
service_patterns = [
|
||||||
r'Failed to start (.+)',
|
r'Failed to start (.+)',
|
||||||
r'Unit (\S+) (?:entered failed state|failed)',
|
r'Unit (\S+) (?:entered failed state|failed)',
|
||||||
@@ -291,13 +291,60 @@ class JournalWatcher:
|
|||||||
match = re.search(pattern, msg)
|
match = re.search(pattern, msg)
|
||||||
if match:
|
if match:
|
||||||
service_name = match.group(1)
|
service_name = match.group(1)
|
||||||
self._emit('service_fail', 'WARNING', {
|
data = {
|
||||||
'service_name': service_name,
|
'service_name': service_name,
|
||||||
'reason': msg[:200],
|
'reason': msg[:300],
|
||||||
'hostname': self._hostname,
|
'hostname': self._hostname,
|
||||||
}, entity='node', entity_id=service_name)
|
}
|
||||||
|
|
||||||
|
# Enrich PVE VM/CT services with guest name and context
|
||||||
|
# pve-container@101 -> LXC container 101
|
||||||
|
# qemu-server@100 -> QEMU VM 100
|
||||||
|
pve_match = re.match(
|
||||||
|
r'(pve-container|qemu-server)@(\d+)', service_name)
|
||||||
|
if pve_match:
|
||||||
|
svc_type = pve_match.group(1)
|
||||||
|
vmid = pve_match.group(2)
|
||||||
|
vm_name = self._resolve_vm_name(vmid)
|
||||||
|
|
||||||
|
if svc_type == 'pve-container':
|
||||||
|
guest_type = 'LXC container'
|
||||||
|
else:
|
||||||
|
guest_type = 'QEMU VM'
|
||||||
|
|
||||||
|
display = f"{guest_type} {vmid}"
|
||||||
|
if vm_name:
|
||||||
|
display = f"{guest_type} {vmid} ({vm_name})"
|
||||||
|
|
||||||
|
data['service_name'] = service_name
|
||||||
|
data['vmid'] = vmid
|
||||||
|
data['vmname'] = vm_name
|
||||||
|
data['guest_type'] = guest_type
|
||||||
|
data['display_name'] = display
|
||||||
|
data['reason'] = (
|
||||||
|
f"{display} failed to start.\n{msg[:300]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
self._emit('service_fail', 'WARNING', data,
|
||||||
|
entity='node', entity_id=service_name)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _resolve_vm_name(self, vmid: str) -> str:
|
||||||
|
"""Try to resolve VMID to a guest name from PVE config files."""
|
||||||
|
if not vmid:
|
||||||
|
return ''
|
||||||
|
# Check QEMU configs
|
||||||
|
for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']:
|
||||||
|
conf = os.path.join(base, f'{vmid}.conf')
|
||||||
|
try:
|
||||||
|
with open(conf) as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith('hostname:') or line.startswith('name:'):
|
||||||
|
return line.split(':', 1)[1].strip()
|
||||||
|
except (OSError, IOError):
|
||||||
|
continue
|
||||||
|
return ''
|
||||||
|
|
||||||
def _check_disk_io(self, msg: str, syslog_id: str, priority: int):
|
def _check_disk_io(self, msg: str, syslog_id: str, priority: int):
|
||||||
"""Detect disk I/O errors from kernel messages."""
|
"""Detect disk I/O errors from kernel messages."""
|
||||||
if syslog_id != 'kernel' and priority > 3:
|
if syslog_id != 'kernel' and priority > 3:
|
||||||
@@ -457,9 +504,6 @@ class TaskWatcher:
|
|||||||
self._thread: Optional[threading.Thread] = None
|
self._thread: Optional[threading.Thread] = None
|
||||||
self._hostname = _hostname()
|
self._hostname = _hostname()
|
||||||
self._last_position = 0
|
self._last_position = 0
|
||||||
# Set by NotificationManager to point at ProxmoxHookWatcher._delivered
|
|
||||||
# so we can skip events the webhook already delivered with richer data.
|
|
||||||
self._webhook_delivered: Optional[dict] = None
|
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
if self._running:
|
if self._running:
|
||||||
@@ -575,22 +619,13 @@ class TaskWatcher:
|
|||||||
# Determine entity type from task type
|
# Determine entity type from task type
|
||||||
entity = 'ct' if task_type.startswith('vz') else 'vm'
|
entity = 'ct' if task_type.startswith('vz') else 'vm'
|
||||||
|
|
||||||
# ── Cross-source dedup: yield to PVE webhook for backup/replication ──
|
# Backup and replication events are handled EXCLUSIVELY by the PVE
|
||||||
# The webhook delivers richer data (full logs, sizes, durations).
|
# webhook, which delivers much richer data (full logs, sizes, durations,
|
||||||
# If the webhook already delivered this event within 120s, skip.
|
# filenames). TaskWatcher skips these entirely to avoid duplicates.
|
||||||
# For backup events, PVE sends ONE webhook for the entire vzdump job
|
_WEBHOOK_EXCLUSIVE = {'backup_complete', 'backup_fail', 'backup_start',
|
||||||
# (covering all VMs), while TaskWatcher sees individual per-VM tasks.
|
'replication_complete', 'replication_fail'}
|
||||||
# So we check by event_type ONLY (no VMID) -- if ANY backup_complete
|
if event_type in _WEBHOOK_EXCLUSIVE:
|
||||||
# arrived from webhook recently, skip ALL backup_complete from tasks.
|
return
|
||||||
_WEBHOOK_TYPES = {'backup_complete', 'backup_fail', 'backup_start',
|
|
||||||
'replication_complete', 'replication_fail'}
|
|
||||||
if event_type in _WEBHOOK_TYPES and self._webhook_delivered:
|
|
||||||
import time as _time
|
|
||||||
# Check type-only key first (covers multi-VM jobs)
|
|
||||||
type_key = f"{event_type}:"
|
|
||||||
for dkey, dtime in self._webhook_delivered.items():
|
|
||||||
if dkey.startswith(type_key) and (_time.time() - dtime) < 120:
|
|
||||||
return # Webhook already delivered this with richer data
|
|
||||||
|
|
||||||
self._queue.put(NotificationEvent(
|
self._queue.put(NotificationEvent(
|
||||||
event_type, severity, data, source='tasks',
|
event_type, severity, data, source='tasks',
|
||||||
@@ -1029,18 +1064,6 @@ class ProxmoxHookWatcher:
|
|||||||
if dur_m:
|
if dur_m:
|
||||||
data['duration'] = dur_m.group(1).strip()
|
data['duration'] = dur_m.group(1).strip()
|
||||||
|
|
||||||
# Record this event for cross-source dedup.
|
|
||||||
# TaskWatcher iterates this dict checking if any key with the same
|
|
||||||
# event_type prefix was delivered recently (within 120s).
|
|
||||||
import time
|
|
||||||
self._delivered[f"{event_type}:{entity_id}"] = time.time()
|
|
||||||
# Cleanup old entries (use del, NOT reassign -- TaskWatcher holds a ref)
|
|
||||||
if len(self._delivered) > 200:
|
|
||||||
cutoff = time.time() - 300
|
|
||||||
stale = [k for k, v in self._delivered.items() if v < cutoff]
|
|
||||||
for k in stale:
|
|
||||||
del self._delivered[k]
|
|
||||||
|
|
||||||
event = NotificationEvent(
|
event = NotificationEvent(
|
||||||
event_type=event_type,
|
event_type=event_type,
|
||||||
severity=severity,
|
severity=severity,
|
||||||
|
|||||||
@@ -874,10 +874,6 @@ class NotificationManager:
|
|||||||
"""Process incoming Proxmox webhook. Delegates to ProxmoxHookWatcher."""
|
"""Process incoming Proxmox webhook. Delegates to ProxmoxHookWatcher."""
|
||||||
if not self._hook_watcher:
|
if not self._hook_watcher:
|
||||||
self._hook_watcher = ProxmoxHookWatcher(self._event_queue)
|
self._hook_watcher = ProxmoxHookWatcher(self._event_queue)
|
||||||
# Share the webhook's delivery record with TaskWatcher
|
|
||||||
# so tasks can yield to richer webhook data for backup/replication.
|
|
||||||
if self._task_watcher:
|
|
||||||
self._task_watcher._webhook_delivered = self._hook_watcher._delivered
|
|
||||||
return self._hook_watcher.process_webhook(payload)
|
return self._hook_watcher.process_webhook(payload)
|
||||||
|
|
||||||
def get_webhook_secret(self) -> str:
|
def get_webhook_secret(self) -> str:
|
||||||
|
|||||||
@@ -103,36 +103,40 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
|
|||||||
|
|
||||||
for vm in parsed.get('vms', []):
|
for vm in parsed.get('vms', []):
|
||||||
status = vm.get('status', '').lower()
|
status = vm.get('status', '').lower()
|
||||||
if status == 'ok':
|
icon = '\u2705' if status == 'ok' else '\u274C'
|
||||||
icon = '\u2705' # green check
|
|
||||||
else:
|
|
||||||
icon = '\u274C' # red X
|
|
||||||
|
|
||||||
vm_line = f"{icon} ID {vm['vmid']} ({vm['name']})"
|
parts.append(f"{icon} ID {vm['vmid']} ({vm['name']})")
|
||||||
parts.append(vm_line)
|
|
||||||
|
|
||||||
|
details = []
|
||||||
if vm.get('size'):
|
if vm.get('size'):
|
||||||
parts.append(f" Size: {vm['size']}")
|
details.append(f"Size: {vm['size']}")
|
||||||
if vm.get('time'):
|
if vm.get('time'):
|
||||||
parts.append(f" Duration: {vm['time']}")
|
details.append(f"Duration: {vm['time']}")
|
||||||
if vm.get('filename'):
|
if vm.get('filename'):
|
||||||
parts.append(f" File: {vm['filename']}")
|
details.append(f"File: {vm['filename']}")
|
||||||
|
if details:
|
||||||
|
parts.append(' | '.join(details))
|
||||||
parts.append('') # blank line between VMs
|
parts.append('') # blank line between VMs
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
vm_count = parsed.get('vm_count', 0)
|
vm_count = parsed.get('vm_count', 0)
|
||||||
if vm_count > 0 or parsed.get('total_size'):
|
if vm_count > 0 or parsed.get('total_size'):
|
||||||
parts.append('Summary:')
|
ok_count = sum(1 for v in parsed.get('vms', [])
|
||||||
|
if v.get('status', '').lower() == 'ok')
|
||||||
|
fail_count = vm_count - ok_count
|
||||||
|
|
||||||
|
summary_parts = []
|
||||||
if vm_count:
|
if vm_count:
|
||||||
ok_count = sum(1 for v in parsed.get('vms', []) if v.get('status', '').lower() == 'ok')
|
summary_parts.append(f"{vm_count} backup(s)")
|
||||||
fail_count = vm_count - ok_count
|
if fail_count:
|
||||||
parts.append(f" Total: {vm_count} backup(s)")
|
summary_parts.append(f"{fail_count} failed")
|
||||||
if fail_count:
|
|
||||||
parts.append(f" Failed: {fail_count}")
|
|
||||||
if parsed.get('total_size'):
|
if parsed.get('total_size'):
|
||||||
parts.append(f" Total size: {parsed['total_size']}")
|
summary_parts.append(f"Total: {parsed['total_size']}")
|
||||||
if parsed.get('total_time'):
|
if parsed.get('total_time'):
|
||||||
parts.append(f" Total time: {parsed['total_time']}")
|
summary_parts.append(f"Time: {parsed['total_time']}")
|
||||||
|
|
||||||
|
if summary_parts:
|
||||||
|
parts.append('--- ' + ' | '.join(summary_parts))
|
||||||
|
|
||||||
return '\n'.join(parts)
|
return '\n'.join(parts)
|
||||||
|
|
||||||
@@ -422,7 +426,7 @@ TEMPLATES = {
|
|||||||
},
|
},
|
||||||
'service_fail': {
|
'service_fail': {
|
||||||
'title': '{hostname}: Service failed - {service_name}',
|
'title': '{hostname}: Service failed - {service_name}',
|
||||||
'body': 'Service {service_name} has failed.\n{reason}',
|
'body': '{reason}',
|
||||||
'group': 'system',
|
'group': 'system',
|
||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
@@ -619,16 +623,9 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
except (KeyError, ValueError):
|
except (KeyError, ValueError):
|
||||||
body_text = template['body']
|
body_text = template['body']
|
||||||
|
|
||||||
# Clean up: remove empty lines and consecutive duplicate lines
|
# Clean up: collapse runs of 3+ blank lines into 1, remove trailing whitespace
|
||||||
cleaned_lines = []
|
import re as _re
|
||||||
for line in body_text.split('\n'):
|
body_text = _re.sub(r'\n{3,}', '\n\n', body_text.strip())
|
||||||
stripped = line.strip()
|
|
||||||
if not stripped:
|
|
||||||
continue
|
|
||||||
if cleaned_lines and stripped == cleaned_lines[-1]:
|
|
||||||
continue # skip consecutive duplicate
|
|
||||||
cleaned_lines.append(stripped)
|
|
||||||
body_text = '\n'.join(cleaned_lines)
|
|
||||||
|
|
||||||
severity = variables.get('severity', 'INFO')
|
severity = variables.get('severity', 'INFO')
|
||||||
group = template.get('group', 'system')
|
group = template.get('group', 'system')
|
||||||
|
|||||||
Reference in New Issue
Block a user