Update notification service

This commit is contained in:
MacRimi
2026-03-03 13:40:46 +01:00
parent f0b8ed20a2
commit da3f99a254
5 changed files with 475 additions and 141 deletions

View File

@@ -40,13 +40,18 @@ interface EventTypeInfo {
default_enabled: boolean
}
/**
 * Per-channel filter overrides for one notification channel.
 * A key that is absent from either map means "no override": the channel
 * falls back to the global setting for that category or event.
 */
interface ChannelOverrides {
/** Category key (e.g. "backup") -> enabled. Saved as `${channel}.events.${category}`. */
categories: Record<string, boolean>
/** Event type (e.g. "vm_start") -> enabled. Saved as `${channel}.event.${eventType}`. */
events: Record<string, boolean>
}
interface NotificationConfig {
enabled: boolean
channels: Record<string, ChannelConfig>
severity_filter: string
event_categories: Record<string, boolean>
event_toggles: Record<string, boolean>
event_types_by_group: Record<string, EventTypeInfo[]>
channel_overrides: Record<string, ChannelOverrides>
ai_enabled: boolean
ai_provider: string
ai_api_key: string
@@ -79,23 +84,22 @@ interface HistoryEntry {
error_message: string | null
}
/**
 * Choices offered by the "Severity Filter" select. `value` is what gets stored
 * in `config.severity_filter`; `label` is the text shown to the user.
 * NOTE(review): DEFAULT_CONFIG sets `severity_filter: "all"`, which matches none
 * of the values below ("info" is the all-severities choice here) -- confirm
 * which spelling is intended so the select can display the default.
 */
const SEVERITY_OPTIONS = [
{ value: "critical", label: "Critical only" },
{ value: "warning", label: "Warning + Critical" },
{ value: "info", label: "All (Info + Warning + Critical)" },
]
/**
 * Event categories shown as rows in the category lists.
 *
 * `key` doubles as the React `key` of each rendered row and as the lookup key
 * into `event_categories` and the per-channel `categories` overrides, so every
 * key must appear exactly once in this list.
 */
const EVENT_CATEGORIES = [
  { key: "system", label: "System", desc: "Startup, shutdown, kernel events" },
  { key: "vm_ct", label: "VM / CT", desc: "Start, stop, crash, migration" },
  { key: "backup", label: "Backups", desc: "Backup start, complete, fail" },
  { key: "resources", label: "Resources", desc: "CPU, memory, temperature" },
  { key: "storage", label: "Storage", desc: "Disk space, I/O, SMART" },
  { key: "network", label: "Network", desc: "Connectivity, bond, latency" },
  { key: "security", label: "Security", desc: "Auth failures, Fail2Ban, firewall" },
  { key: "cluster", label: "Cluster", desc: "Quorum, split-brain, HA fencing" },
  { key: "services", label: "Services", desc: "System services, shutdown, reboot" },
  { key: "health", label: "Health Monitor", desc: "Health checks, degradation, recovery" },
  { key: "updates", label: "Updates", desc: "System and PVE updates" },
  { key: "other", label: "Other", desc: "Uncategorized notifications" },
]
// Channel identifiers, in the order their per-channel filter sections are rendered.
const CHANNEL_TYPES = ["telegram", "gotify", "discord", "email"] as const
const AI_PROVIDERS = [
{ value: "openai", label: "OpenAI" },
{ value: "groq", label: "Groq" },
@@ -109,13 +113,19 @@ const DEFAULT_CONFIG: NotificationConfig = {
discord: { enabled: false },
email: { enabled: false },
},
severity_filter: "all",
event_categories: {
system: true, vm_ct: true, backup: true, resources: true,
storage: true, network: true, security: true, cluster: true,
vm_ct: true, backup: true, resources: true, storage: true,
network: true, security: true, cluster: true, services: true,
health: true, updates: true, other: true,
},
event_toggles: {},
event_types_by_group: {},
channel_overrides: {
telegram: { categories: {}, events: {} },
gotify: { categories: {}, events: {} },
discord: { categories: {}, events: {} },
email: { categories: {}, events: {} },
},
ai_enabled: false,
ai_provider: "openai",
ai_api_key: "",
@@ -217,7 +227,6 @@ export function NotificationSettings() {
const flattenConfig = (cfg: NotificationConfig): Record<string, string> => {
const flat: Record<string, string> = {
enabled: String(cfg.enabled),
severity_filter: cfg.severity_filter,
ai_enabled: String(cfg.ai_enabled),
ai_provider: cfg.ai_provider,
ai_api_key: cfg.ai_api_key,
@@ -235,20 +244,17 @@ export function NotificationSettings() {
flat[`${chName}.${field}`] = String(value ?? "")
}
}
// Flatten event_categories: { system: true, backups: false } -> events.system, events.backups
// Flatten global event_categories: { vm_ct: true, backup: false } -> events.vm_ct, events.backup
for (const [cat, enabled] of Object.entries(cfg.event_categories)) {
flat[`events.${cat}`] = String(enabled)
}
// Flatten event_toggles: { vm_start: true, vm_stop: false } -> event.vm_start, event.vm_stop
// Always write ALL toggles to DB so the backend has an explicit record.
// This ensures default_enabled changes in templates don't get overridden by stale DB values.
// Flatten global event_toggles: { vm_start: true } -> event.vm_start
if (cfg.event_toggles) {
for (const [evt, enabled] of Object.entries(cfg.event_toggles)) {
flat[`event.${evt}`] = String(enabled)
}
}
// Also write any events NOT in event_toggles using their template defaults.
// This covers newly added templates whose default_enabled may be false.
// Write defaults for events NOT in toggles
if (cfg.event_types_by_group) {
for (const events of Object.values(cfg.event_types_by_group)) {
for (const evt of (events as Array<{type: string, default_enabled: boolean}>)) {
@@ -259,6 +265,21 @@ export function NotificationSettings() {
}
}
}
// Flatten per-channel overrides: telegram.events.backup, telegram.event.vm_start, etc.
if (cfg.channel_overrides) {
for (const [chName, overrides] of Object.entries(cfg.channel_overrides)) {
if (overrides.categories) {
for (const [cat, enabled] of Object.entries(overrides.categories)) {
flat[`${chName}.events.${cat}`] = String(enabled)
}
}
if (overrides.events) {
for (const [evt, enabled] of Object.entries(overrides.events)) {
flat[`${chName}.event.${evt}`] = String(enabled)
}
}
}
}
return flat
}
@@ -1052,27 +1073,8 @@ matcher: proxmenux-pbs
<span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">Filters & Events</span>
</div>
<div className="rounded-lg border border-border/50 bg-muted/20 p-3 space-y-4">
{/* Severity */}
{/* Event Categories (global defaults -- per-channel overrides in Channel Filters below) */}
<div className="space-y-1.5">
<Label className="text-[11px] text-muted-foreground">Severity Filter</Label>
<Select
value={config.severity_filter}
onValueChange={v => updateConfig(p => ({ ...p, severity_filter: v }))}
disabled={!editMode}
>
<SelectTrigger className={`h-8 text-xs ${!editMode ? "opacity-60" : ""}`}>
<SelectValue />
</SelectTrigger>
<SelectContent>
{SEVERITY_OPTIONS.map(opt => (
<SelectItem key={opt.value} value={opt.value}>{opt.label}</SelectItem>
))}
</SelectContent>
</Select>
</div>
{/* Event Categories */}
<div className="space-y-1.5 border-t border-border/30 pt-3">
<Label className="text-[11px] text-muted-foreground">Event Categories</Label>
<div className="space-y-1.5">
{EVENT_CATEGORIES.map(cat => {
@@ -1198,6 +1200,118 @@ matcher: proxmenux-pbs
})}
</div>
</div>
{/* Per-channel overrides: one collapsible section per ENABLED channel, each
    listing every event category with a switch. A category without an entry in
    the channel's `categories` map follows the global toggle. */}
<div className="space-y-2 border-t border-border/30 pt-3">
  <Label className="text-[11px] text-muted-foreground">Channel Filters</Label>
  <p className="text-[10px] text-muted-foreground leading-relaxed">
    By default every channel inherits the global settings above. Override specific categories per channel to customize what each destination receives.
  </p>
  <div className="space-y-2">
    {CHANNEL_TYPES.map(chName => {
      const chEnabled = config.channels[chName]?.enabled
      if (!chEnabled) return null
      const overrides = config.channel_overrides?.[chName] || { categories: {}, events: {} }
      // Only category overrides are editable here, so only they mark a channel as "customized".
      const hasOverrides = Object.keys(overrides.categories).length > 0
      const chLabel = chName === "email" ? "Email" : chName.charAt(0).toUpperCase() + chName.slice(1)
      // Tailwind only emits CSS for class names it can find as complete strings in
      // the source, so the accent classes are spelled out in full rather than
      // assembled from a colour name at runtime. Unlisted channels fall back to amber.
      const amberClasses = {
        text: "text-amber-400",
        badge: "bg-amber-500/15 text-amber-400",
        switchOn: "bg-amber-600",
      }
      const accentClasses: Record<string, typeof amberClasses> = {
        telegram: { text: "text-blue-400", badge: "bg-blue-500/15 text-blue-400", switchOn: "bg-blue-600" },
        gotify: { text: "text-green-400", badge: "bg-green-500/15 text-green-400", switchOn: "bg-green-600" },
        discord: { text: "text-indigo-400", badge: "bg-indigo-500/15 text-indigo-400", switchOn: "bg-indigo-600" },
      }
      const chClasses = accentClasses[chName] ?? amberClasses
      return (
        <details key={chName} className="group">
          <summary className={`flex items-center justify-between text-[11px] font-medium cursor-pointer hover:text-foreground transition-colors py-1.5 px-2 rounded-md hover:bg-muted/50 ${
            hasOverrides ? chClasses.text : "text-muted-foreground"
          }`}>
            <div className="flex items-center gap-2">
              <ChevronDown className="h-3 w-3 group-open:rotate-180 transition-transform" />
              <span>{chLabel}</span>
              {hasOverrides && (
                <span className={`text-[9px] px-1.5 py-0.5 rounded-full ${chClasses.badge}`}>
                  customized
                </span>
              )}
            </div>
            {!hasOverrides && (
              <span className="text-[9px] text-muted-foreground/60">inherits global</span>
            )}
          </summary>
          <div className="mt-1.5 ml-5 space-y-1">
            {EVENT_CATEGORIES.map(cat => {
              // Effective state: the channel's own override if present, else the global toggle.
              const globalEnabled = config.event_categories[cat.key] ?? true
              const override = overrides.categories[cat.key]
              const isCustomized = override !== undefined
              const effectiveEnabled = isCustomized ? override : globalEnabled
              return (
                <div key={cat.key} className="flex items-center justify-between py-1 px-2 rounded hover:bg-muted/30">
                  <div className="flex items-center gap-2">
                    <span className={`text-[11px] ${effectiveEnabled ? "text-foreground" : "text-muted-foreground/50"}`}>
                      {cat.label}
                    </span>
                    {!isCustomized && (
                      <span className="text-[9px] text-muted-foreground/40">global</span>
                    )}
                  </div>
                  <div className="flex items-center gap-1.5">
                    {isCustomized && (
                      <button
                        type="button"
                        className="text-[9px] text-muted-foreground hover:text-foreground px-1"
                        disabled={!editMode}
                        onClick={() => {
                          if (!editMode) return
                          // "reset" removes the override entry so the category inherits the global value again.
                          updateConfig(p => {
                            const ch = { ...(p.channel_overrides?.[chName] || { categories: {}, events: {} }) }
                            const cats = { ...ch.categories }
                            delete cats[cat.key]
                            return { ...p, channel_overrides: { ...p.channel_overrides, [chName]: { ...ch, categories: cats } } }
                          })
                        }}
                      >
                        reset
                      </button>
                    )}
                    <button
                      type="button"
                      role="switch"
                      aria-checked={effectiveEnabled}
                      disabled={!editMode}
                      className={`relative inline-flex h-3.5 w-6 shrink-0 items-center rounded-full transition-colors ${
                        !editMode ? "opacity-50 cursor-not-allowed" : "cursor-pointer"
                      } ${effectiveEnabled ? chClasses.switchOn : "bg-muted-foreground/30"}`}
                      onClick={() => {
                        if (!editMode) return
                        // Toggling always writes an explicit override: the inverse of what is currently shown.
                        updateConfig(p => {
                          const ch = { ...(p.channel_overrides?.[chName] || { categories: {}, events: {} }) }
                          return {
                            ...p,
                            channel_overrides: {
                              ...p.channel_overrides,
                              [chName]: { ...ch, categories: { ...ch.categories, [cat.key]: !effectiveEnabled } }
                            }
                          }
                        })
                      }}
                    >
                      <span className={`pointer-events-none block h-2.5 w-2.5 rounded-full bg-background shadow-sm transition-transform ${
                        effectiveEnabled ? "translate-x-3" : "translate-x-0.5"
                      }`} />
                    </button>
                  </div>
                </div>
              )
            })}
          </div>
        </details>
      )
    })}
    {/* Hint shown when no channel is enabled, i.e. the list above rendered nothing. */}
    {CHANNEL_TYPES.every(ch => !config.channels[ch]?.enabled) && (
      <p className="text-[10px] text-muted-foreground/50 italic py-2">
        Enable at least one channel above to configure per-channel filters.
      </p>
    )}
  </div>
</div>
</div>{/* close bordered filters container */}
</div>

View File

@@ -575,13 +575,31 @@ def _temperature_collector_loop():
def _health_collector_loop():
"""Background thread: run full health checks every 5 minutes.
Keeps the health cache always fresh and records events/errors in the DB
so the future notification service can consume them."""
Keeps the health cache always fresh and records events/errors in the DB.
Also emits notifications when a health category degrades (OK -> WARNING/CRITICAL)."""
from health_monitor import health_monitor
# Wait 30s after startup to let other services initialize
time.sleep(30)
# Track previous status per category to detect transitions
_prev_statuses = {}
# Severity ranking for comparison
_SEV_RANK = {'OK': 0, 'INFO': 0, 'UNKNOWN': 1, 'WARNING': 2, 'CRITICAL': 3}
# Human-readable category names
_CAT_NAMES = {
'cpu': 'CPU Usage & Temperature',
'memory': 'Memory & Swap',
'storage': 'Storage Mounts & Space',
'disks': 'Disk I/O & Errors',
'network': 'Network Interfaces',
'vms': 'VMs & Containers',
'services': 'PVE Services',
'logs': 'System Logs',
'updates': 'System Updates',
'security': 'Security',
}
while True:
try:
# Run full health check (results get cached internally + recorded in DB)
@@ -598,6 +616,64 @@ def _health_collector_loop():
health_monitor.cached_results['_bg_detailed'] = result
health_monitor.last_check_times['_bg_overall'] = time.time()
health_monitor.last_check_times['_bg_detailed'] = time.time()
# ── Health degradation notifications ──
# Compare each category's current status to previous cycle.
# Notify when a category DEGRADES (OK->WARNING, WARNING->CRITICAL, etc.)
# Include the detailed 'reason' so the user knows exactly what triggered it.
details = result.get('details', {})
degraded = []
for cat_key, cat_data in details.items():
cur_status = cat_data.get('status', 'OK')
prev_status = _prev_statuses.get(cat_key, 'OK')
cur_rank = _SEV_RANK.get(cur_status, 0)
prev_rank = _SEV_RANK.get(prev_status, 0)
if cur_rank > prev_rank and cur_rank >= 2: # WARNING or CRITICAL
reason = cat_data.get('reason', f'{cat_key} status changed to {cur_status}')
cat_name = _CAT_NAMES.get(cat_key, cat_key)
degraded.append({
'category': cat_name,
'status': cur_status,
'reason': reason,
})
_prev_statuses[cat_key] = cur_status
# Send grouped notification if any categories degraded
if degraded and notification_manager._enabled:
hostname = result.get('hostname', '')
if not hostname:
import socket as _sock
hostname = _sock.gethostname()
if len(degraded) == 1:
d = degraded[0]
title = f"{hostname}: Health {d['status']} - {d['category']}"
body = d['reason']
severity = d['status']
else:
# Multiple categories degraded at once -- group them
max_sev = max(degraded, key=lambda x: _SEV_RANK.get(x['status'], 0))['status']
title = f"{hostname}: {len(degraded)} health checks degraded"
lines = []
for d in degraded:
lines.append(f" [{d['status']}] {d['category']}: {d['reason']}")
body = '\n'.join(lines)
severity = max_sev
try:
notification_manager.send_notification(
event_type='health_degraded',
severity=severity,
title=title,
message=body,
data={'hostname': hostname, 'count': str(len(degraded))},
source='health_monitor',
)
except Exception as e:
print(f"[ProxMenux] Health notification error: {e}")
except Exception as e:
print(f"[ProxMenux] Health collector error: {e}")

View File

@@ -2778,24 +2778,41 @@ class HealthMonitor:
return 'INFO'
return severity
# Build detail strings that include the actual error samples
# so the user can see exactly WHAT is triggering the warning.
if cascade_count > 0:
cascade_detail = f'{cascade_count} pattern(s) repeating >=15 times: ' + '; '.join(cascade_samples)
else:
cascade_detail = 'No cascading errors'
if spike_count > 0:
spike_detail = f'{spike_count} pattern(s) with 4x increase: ' + '; '.join(spike_samples)
else:
spike_detail = 'No error spikes'
if persistent_count > 0:
persist_detail = f'{persistent_count} recurring pattern(s) over 15+ min: ' + '; '.join(persist_samples)
else:
persist_detail = 'No persistent patterns'
log_checks = {
'log_error_cascade': {
'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
'detail': cascade_detail,
'dismissable': True,
'dismissed': 'log_error_cascade' in dismissed_keys,
'error_key': 'log_error_cascade'
},
'log_error_spike': {
'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
'detail': spike_detail,
'dismissable': True,
'dismissed': 'log_error_spike' in dismissed_keys,
'error_key': 'log_error_spike'
},
'log_persistent_errors': {
'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
'detail': persist_detail,
'dismissable': True,
'dismissed': 'log_persistent_errors' in dismissed_keys,
'error_key': 'log_persistent_errors'

View File

@@ -69,9 +69,15 @@ GROUP_RATE_LIMITS = {
'resources': {'max_per_minute': 3, 'max_per_hour': 20},
'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60},
'backup': {'max_per_minute': 5, 'max_per_hour': 30},
'system': {'max_per_minute': 5, 'max_per_hour': 30},
'services': {'max_per_minute': 5, 'max_per_hour': 30},
'health': {'max_per_minute': 3, 'max_per_hour': 20},
'updates': {'max_per_minute': 3, 'max_per_hour': 15},
'other': {'max_per_minute': 5, 'max_per_hour': 30},
}
# Default fallback for unknown groups
_DEFAULT_RATE_LIMIT = {'max_per_minute': 5, 'max_per_hour': 30}
class GroupRateLimiter:
"""Rate limiter per event group. Prevents notification storms."""
@@ -84,7 +90,7 @@ class GroupRateLimiter:
def allow(self, group: str) -> bool:
"""Check if group rate limit allows this event."""
limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system'])
limits = GROUP_RATE_LIMITS.get(group, _DEFAULT_RATE_LIMIT)
now = time.time()
# Initialize if needed
@@ -554,35 +560,28 @@ class NotificationManager:
print(f"[NotificationManager] Aggregation flush error: {e}")
def _process_event(self, event: NotificationEvent):
"""Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch."""
"""Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch.
NOTE: Group and per-event filters are checked globally here.
Per-channel overrides are applied later in _dispatch_to_channels().
"""
if not self._enabled:
return
# Check if this event's GROUP is enabled in settings.
# The UI saves categories by group key: events.vm_ct, events.backup, etc.
# Check if this event's GROUP is enabled globally.
template = TEMPLATES.get(event.event_type, {})
event_group = template.get('group', 'system')
event_group = template.get('group', 'other')
group_setting = f'events.{event_group}'
if self._config.get(group_setting, 'true') == 'false':
return
# Check if this SPECIFIC event type is enabled (granular per-event toggle).
# Key format: event.{event_type} = "true"/"false"
# Check if this SPECIFIC event type is enabled globally.
# Default comes from the template's default_enabled field.
default_enabled = 'true' if template.get('default_enabled', True) else 'false'
event_specific = f'event.{event.event_type}'
if self._config.get(event_specific, default_enabled) == 'false':
return
# Check severity filter.
# The UI saves severity_filter as: "all", "warning", "critical".
# Map to our internal severity names for comparison.
severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
raw_filter = self._config.get('severity_filter', 'all')
min_severity = severity_map.get(raw_filter.lower(), 'INFO')
if not self._meets_severity(event.severity, min_severity):
return
# Try aggregation (may buffer the event)
result = self._aggregator.ingest(event)
if result is None:
@@ -593,30 +592,23 @@ class NotificationManager:
self._dispatch_event(event)
def _process_event_direct(self, event: NotificationEvent):
"""Process a burst summary event. Bypasses aggregator but applies ALL other filters."""
"""Process a burst summary event. Bypasses aggregator but applies global filters."""
if not self._enabled:
return
# Check group filter (same as _process_event)
# Check group filter
template = TEMPLATES.get(event.event_type, {})
event_group = template.get('group', 'system')
event_group = template.get('group', 'other')
group_setting = f'events.{event_group}'
if self._config.get(group_setting, 'true') == 'false':
return
# Check per-event filter (same as _process_event)
# Check per-event filter
default_enabled = 'true' if template.get('default_enabled', True) else 'false'
event_specific = f'event.{event.event_type}'
if self._config.get(event_specific, default_enabled) == 'false':
return
# Check severity filter (same mapping as _process_event)
severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
raw_filter = self._config.get('severity_filter', 'all')
min_severity = severity_map.get(raw_filter.lower(), 'INFO')
if not self._meets_severity(event.severity, min_severity):
return
self._dispatch_event(event)
def _dispatch_event(self, event: NotificationEvent):
@@ -636,7 +628,7 @@ class NotificationManager:
# Check group rate limit
template = TEMPLATES.get(event.event_type, {})
group = template.get('group', 'system')
group = template.get('group', 'other')
if not self._group_limiter.allow(group):
return
@@ -674,11 +666,33 @@ class NotificationManager:
def _dispatch_to_channels(self, title: str, body: str, severity: str,
event_type: str, data: Dict, source: str):
"""Send notification through all configured channels."""
"""Send notification through configured channels, respecting per-channel overrides.
Each channel can override global category/event settings:
- {channel}.events.{group} = "true"/"false" (category override)
- {channel}.event.{type} = "true"/"false" (per-event override)
If no override exists, the channel inherits the global setting (already checked).
"""
with self._lock:
channels = dict(self._channels)
template = TEMPLATES.get(event_type, {})
event_group = template.get('group', 'other')
for ch_name, channel in channels.items():
# ── Per-channel override check ──
# If the channel has an explicit override for this group or event, respect it.
# If no override, the global filter already passed (checked in _process_event).
ch_group_key = f'{ch_name}.events.{event_group}'
ch_group_override = self._config.get(ch_group_key)
if ch_group_override == 'false':
continue # Channel explicitly disabled this category
ch_event_key = f'{ch_name}.event.{event_type}'
ch_event_override = self._config.get(ch_event_key)
if ch_event_override == 'false':
continue # Channel explicitly disabled this event
try:
result = channel.send(title, body, severity, data)
self._record_history(
@@ -857,12 +871,6 @@ class NotificationManager:
except Exception:
pass
@staticmethod
def _meets_severity(event_severity: str, min_severity: str) -> bool:
"""Check if event severity meets the minimum threshold."""
# Rank order used for the comparison: INFO < WARNING < CRITICAL.
levels = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2}
# A name missing from `levels` (on either side) ranks as 0, the same as INFO.
return levels.get(event_severity, 0) >= levels.get(min_severity, 0)
# ─── History Recording ──────────────────────────────────────
def _record_history(self, event_type: str, channel: str, title: str,
@@ -1171,7 +1179,7 @@ class NotificationManager:
channels[ch_type] = ch_cfg
# Build event_categories dict (group-level toggle)
# EVENT_GROUPS is a dict: { 'system': {...}, 'vm_ct': {...}, ... }
# EVENT_GROUPS is a dict: { 'vm_ct': {...}, 'services': {...}, 'health': {...}, ... }
event_categories = {}
for group_key in EVENT_GROUPS:
event_categories[group_key] = self._config.get(f'events.{group_key}', 'true') == 'true'
@@ -1189,13 +1197,28 @@ class NotificationManager:
# Build event_types_by_group for UI rendering
event_types_by_group = get_event_types_by_group()
# Build per-channel overrides
# Keys: {channel}.events.{group} and {channel}.event.{event_type}
channel_overrides = {}
for ch_type in CHANNEL_TYPES:
ch_overrides = {'categories': {}, 'events': {}}
for group_key in EVENT_GROUPS:
val = self._config.get(f'{ch_type}.events.{group_key}')
if val is not None:
ch_overrides['categories'][group_key] = val == 'true'
for event_type_key in TEMPLATES:
val = self._config.get(f'{ch_type}.event.{event_type_key}')
if val is not None:
ch_overrides['events'][event_type_key] = val == 'true'
channel_overrides[ch_type] = ch_overrides
config = {
'enabled': self._enabled,
'channels': channels,
'severity_filter': self._config.get('severity_filter', 'all'),
'event_categories': event_categories,
'event_toggles': event_toggles,
'event_types_by_group': event_types_by_group,
'channel_overrides': channel_overrides,
'ai_enabled': self._config.get('ai_enabled', 'false') == 'true',
'ai_provider': self._config.get('ai_provider', 'openai'),
'ai_api_key': self._config.get('ai_api_key', ''),

View File

@@ -342,25 +342,36 @@ TEMPLATES = {
'state_change': {
'title': '{hostname}: {category} changed to {current}',
'body': '{category} status changed from {previous} to {current}.\n{reason}',
'group': 'system',
'label': 'Health state changed',
'group': 'health',
'default_enabled': False,
},
'new_error': {
'title': '{hostname}: New {severity} - {category}',
'body': '{reason}',
'group': 'system',
'label': 'New health issue',
'group': 'health',
'default_enabled': True,
},
'error_resolved': {
'title': '{hostname}: Resolved - {category}',
'body': '{reason}\nDuration: {duration}',
'group': 'system',
'label': 'Health issue resolved',
'group': 'health',
'default_enabled': True,
},
'error_escalated': {
'title': '{hostname}: Escalated to {severity} - {category}',
'body': '{reason}',
'group': 'system',
'label': 'Health issue escalated',
'group': 'health',
'default_enabled': True,
},
'health_degraded': {
'title': '{hostname}: Health check degraded',
'body': '{reason}',
'label': 'Health check degraded',
'group': 'health',
'default_enabled': True,
},
@@ -368,90 +379,105 @@ TEMPLATES = {
'vm_start': {
'title': '{hostname}: VM {vmid} started',
'body': '{vmname} ({vmid}) has been started.',
'label': 'VM started',
'group': 'vm_ct',
'default_enabled': True,
},
'vm_stop': {
'title': '{hostname}: VM {vmid} stopped',
'body': '{vmname} ({vmid}) has been stopped.',
'label': 'VM stopped',
'group': 'vm_ct',
'default_enabled': False,
},
'vm_shutdown': {
'title': '{hostname}: VM {vmid} shutdown',
'body': '{vmname} ({vmid}) has been shut down.',
'label': 'VM shutdown',
'group': 'vm_ct',
'default_enabled': False,
},
'vm_fail': {
'title': '{hostname}: VM {vmid} FAILED',
'body': '{vmname} ({vmid}) has failed.\n{reason}',
'label': 'VM FAILED',
'group': 'vm_ct',
'default_enabled': True,
},
'vm_restart': {
'title': '{hostname}: VM {vmid} restarted',
'body': '{vmname} ({vmid}) has been restarted.',
'label': 'VM restarted',
'group': 'vm_ct',
'default_enabled': False,
},
'ct_start': {
'title': '{hostname}: CT {vmid} started',
'body': '{vmname} ({vmid}) has been started.',
'label': 'CT started',
'group': 'vm_ct',
'default_enabled': True,
},
'ct_stop': {
'title': '{hostname}: CT {vmid} stopped',
'body': '{vmname} ({vmid}) has been stopped.',
'label': 'CT stopped',
'group': 'vm_ct',
'default_enabled': False,
},
'ct_shutdown': {
'title': '{hostname}: CT {vmid} shutdown',
'body': '{vmname} ({vmid}) has been shut down.',
'label': 'CT shutdown',
'group': 'vm_ct',
'default_enabled': False,
},
'ct_restart': {
'title': '{hostname}: CT {vmid} restarted',
'body': '{vmname} ({vmid}) has been restarted.',
'label': 'CT restarted',
'group': 'vm_ct',
'default_enabled': False,
},
'ct_fail': {
'title': '{hostname}: CT {vmid} FAILED',
'body': '{vmname} ({vmid}) has failed.\n{reason}',
'label': 'CT FAILED',
'group': 'vm_ct',
'default_enabled': True,
},
'migration_start': {
'title': '{hostname}: Migration started - {vmid}',
'body': '{vmname} ({vmid}) migration to {target_node} started.',
'label': 'Migration started',
'group': 'vm_ct',
'default_enabled': True,
},
'migration_complete': {
'title': '{hostname}: Migration complete - {vmid}',
'body': '{vmname} ({vmid}) migrated successfully to {target_node}.',
'label': 'Migration complete',
'group': 'vm_ct',
'default_enabled': True,
},
'migration_fail': {
'title': '{hostname}: Migration FAILED - {vmid}',
'body': '{vmname} ({vmid}) migration to {target_node} failed.\n{reason}',
'label': 'Migration FAILED',
'group': 'vm_ct',
'default_enabled': True,
},
'replication_fail': {
'title': '{hostname}: Replication FAILED - {vmid}',
'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}',
'label': 'Replication FAILED',
'group': 'vm_ct',
'default_enabled': True,
},
'replication_complete': {
'title': '{hostname}: Replication complete - {vmid}',
'body': 'Replication of {vmname} ({vmid}) completed successfully.',
'label': 'Replication complete',
'group': 'vm_ct',
'default_enabled': False,
},
@@ -460,30 +486,35 @@ TEMPLATES = {
'backup_start': {
'title': '{hostname}: Backup started',
'body': '{reason}',
'label': 'Backup started',
'group': 'backup',
'default_enabled': False,
},
'backup_complete': {
'title': '{hostname}: Backup complete - {vmid}',
'body': 'Backup of {vmname} ({vmid}) completed successfully.\nSize: {size}',
'label': 'Backup complete',
'group': 'backup',
'default_enabled': True,
},
'backup_fail': {
'title': '{hostname}: Backup FAILED - {vmid}',
'body': 'Backup of {vmname} ({vmid}) has failed.\n{reason}',
'label': 'Backup FAILED',
'group': 'backup',
'default_enabled': True,
},
'snapshot_complete': {
'title': '{hostname}: Snapshot created - {vmid}',
'body': 'Snapshot of {vmname} ({vmid}) created: {snapshot_name}',
'label': 'Snapshot created',
'group': 'backup',
'default_enabled': False,
},
'snapshot_fail': {
'title': '{hostname}: Snapshot FAILED - {vmid}',
'body': 'Snapshot of {vmname} ({vmid}) failed.\n{reason}',
'label': 'Snapshot FAILED',
'group': 'backup',
'default_enabled': True,
},
@@ -492,42 +523,49 @@ TEMPLATES = {
'cpu_high': {
'title': '{hostname}: High CPU usage ({value}%)',
'body': 'CPU usage is at {value}% on {cores} cores.\n{details}',
'label': 'High CPU usage',
'group': 'resources',
'default_enabled': True,
},
'ram_high': {
'title': '{hostname}: High memory usage ({value}%)',
'body': 'Memory usage: {used} / {total} ({value}%).\n{details}',
'label': 'High memory usage',
'group': 'resources',
'default_enabled': True,
},
'temp_high': {
'title': '{hostname}: High temperature ({value}C)',
'body': 'CPU temperature: {value}C (threshold: {threshold}C).\n{details}',
'label': 'High temperature',
'group': 'resources',
'default_enabled': True,
},
'disk_space_low': {
'title': '{hostname}: Low disk space on {mount}',
'body': '{mount}: {used}% used ({available} available).',
'label': 'Low disk space',
'group': 'storage',
'default_enabled': True,
},
'disk_io_error': {
'title': '{hostname}: Disk failure detected on {device}',
'body': '{reason}',
'label': 'Disk failure / I/O error',
'group': 'storage',
'default_enabled': True,
},
'storage_unavailable': {
'title': '{hostname}: Storage unavailable - {storage_name}',
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
'label': 'Storage unavailable',
'group': 'storage',
'default_enabled': True,
},
'load_high': {
'title': '{hostname}: High system load ({value})',
'body': 'System load average: {value} on {cores} cores.\n{details}',
'label': 'High system load',
'group': 'resources',
'default_enabled': True,
},
@@ -536,12 +574,14 @@ TEMPLATES = {
'network_down': {
'title': '{hostname}: Network connectivity lost',
'body': 'Network connectivity check failed.\n{reason}',
'label': 'Network connectivity lost',
'group': 'network',
'default_enabled': True,
},
'network_latency': {
'title': '{hostname}: High network latency ({value}ms)',
'body': 'Latency to gateway: {value}ms (threshold: {threshold}ms).',
'label': 'High network latency',
'group': 'network',
'default_enabled': False,
},
@@ -550,24 +590,28 @@ TEMPLATES = {
'auth_fail': {
'title': '{hostname}: Authentication failure',
'body': 'Failed login attempt from {source_ip}.\nUser: {username}\nService: {service}',
'label': 'Authentication failure',
'group': 'security',
'default_enabled': True,
},
'ip_block': {
'title': '{hostname}: IP blocked by Fail2Ban',
'body': 'IP {source_ip} has been banned.\nJail: {jail}\nFailures: {failures}',
'label': 'IP blocked by Fail2Ban',
'group': 'security',
'default_enabled': True,
},
'firewall_issue': {
'title': '{hostname}: Firewall issue detected',
'body': '{reason}',
'label': 'Firewall issue detected',
'group': 'security',
'default_enabled': True,
},
'user_permission_change': {
'title': '{hostname}: User permission changed',
'body': 'User: {username}\nChange: {change_details}',
'label': 'User permission changed',
'group': 'security',
'default_enabled': True,
},
@@ -576,101 +620,128 @@ TEMPLATES = {
'split_brain': {
'title': '{hostname}: SPLIT-BRAIN detected',
'body': 'Cluster split-brain condition detected.\nQuorum status: {quorum}',
'label': 'SPLIT-BRAIN detected',
'group': 'cluster',
'default_enabled': True,
},
'node_disconnect': {
'title': '{hostname}: Node disconnected',
'body': 'Node {node_name} has disconnected from the cluster.',
'label': 'Node disconnected',
'group': 'cluster',
'default_enabled': True,
},
'node_reconnect': {
'title': '{hostname}: Node reconnected',
'body': 'Node {node_name} has reconnected to the cluster.',
'label': 'Node reconnected',
'group': 'cluster',
'default_enabled': True,
},
# ── System events ──
# ── Services events ──
'system_shutdown': {
'title': '{hostname}: System shutting down',
'body': '{reason}',
'group': 'system',
'label': 'System shutting down',
'group': 'services',
'default_enabled': True,
},
'system_reboot': {
'title': '{hostname}: System rebooting',
'body': '{reason}',
'group': 'system',
'label': 'System rebooting',
'group': 'services',
'default_enabled': True,
},
'system_problem': {
'title': '{hostname}: System problem detected',
'body': '{reason}',
'group': 'system',
'label': 'System problem detected',
'group': 'services',
'default_enabled': True,
},
'service_fail': {
'title': '{hostname}: Service failed - {service_name}',
'body': '{reason}',
'group': 'system',
'label': 'Service failed',
'group': 'services',
'default_enabled': True,
},
'oom_kill': {
'title': '{hostname}: OOM Kill - {process}',
'body': '{reason}',
'label': 'Out of memory kill',
'group': 'services',
'default_enabled': True,
},
# ── Hidden internal templates (not shown in UI) ──
'service_fail_batch': {
'title': '{hostname}: {service_count} services failed',
'body': '{reason}',
'group': 'system',
'label': 'Service fail batch',
'group': 'services',
'default_enabled': True,
'hidden': True,
},
'system_mail': {
'title': '{hostname}: {pve_title}',
'body': '{reason}',
'group': 'system',
'label': 'PVE system mail',
'group': 'other',
'default_enabled': True,
'hidden': True,
},
'webhook_test': {
'title': '{hostname}: Webhook test received',
'body': 'PVE webhook connectivity test successful.\n{reason}',
'label': 'Webhook test',
'group': 'other',
'default_enabled': True,
'hidden': True,
},
'update_available': {
'title': '{hostname}: Updates available',
'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',
'group': 'system',
'default_enabled': False, # Superseded by update_summary
},
'update_complete': {
'title': '{hostname}: Update completed',
'body': '{details}',
'group': 'system',
'label': 'Updates available (legacy)',
'group': 'updates',
'default_enabled': False,
'hidden': True,
},
# ── Unknown persistent (from health monitor) ──
'unknown_persistent': {
'title': '{hostname}: Check unavailable - {category}',
'body': 'Health check for {category} has been unavailable for 3+ cycles.\n{reason}',
'group': 'system',
'label': 'Check unavailable',
'group': 'health',
'default_enabled': False,
'hidden': True,
},
# ── Persistent Health Issues (daily digest) ──
# ── Health Monitor events ──
'health_persistent': {
'title': '{hostname}: {count} active health issue(s)',
'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.',
'group': 'system',
'label': 'Active health issues (daily)',
'group': 'health',
'default_enabled': True,
},
'health_issue_new': {
'title': '{hostname}: New health issue - {category}',
'body': 'New {severity} issue detected:\n{reason}',
'group': 'system',
'label': 'New health issue',
'group': 'health',
'default_enabled': True,
},
'health_issue_resolved': {
'title': '{hostname}: Resolved - {category}',
'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}',
'group': 'system',
'label': 'Health issue resolved',
'group': 'health',
'default_enabled': True,
},
# ── Update notifications (enriched) ──
# ── Update notifications ──
'update_summary': {
'title': '{hostname}: Updates available',
'body': (
@@ -680,80 +751,99 @@ TEMPLATES = {
'Kernel updates: {kernel_count}\n'
'Important packages: {important_list}'
),
'group': 'system',
'label': 'Updates available',
'group': 'updates',
'default_enabled': True,
},
'pve_update': {
'title': '{hostname}: Proxmox VE {new_version} available',
'body': 'Proxmox VE {current_version} -> {new_version}\n{details}',
'group': 'system',
'label': 'Proxmox VE update available',
'group': 'updates',
'default_enabled': True,
},
# ── PVE webhook test ──
'webhook_test': {
'title': '{hostname}: Webhook test received',
'body': 'PVE webhook connectivity test successful.\n{reason}',
'group': 'system',
'default_enabled': True,
'update_complete': {
'title': '{hostname}: Update completed',
'body': '{details}',
'label': 'Update completed',
'group': 'updates',
'default_enabled': False,
},
# ── Burst aggregation summaries ──
# ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
# These inherit enabled state from their parent event type at dispatch time.
'burst_auth_fail': {
'title': '{hostname}: {count} auth failures in {window}',
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
'label': 'Auth failures burst',
'group': 'security',
'default_enabled': True,
'hidden': True,
},
'burst_ip_block': {
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
'label': 'IP block burst',
'group': 'security',
'default_enabled': True,
'hidden': True,
},
'burst_disk_io': {
'title': '{hostname}: {count} disk I/O errors on {entity_list}',
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
'label': 'Disk I/O burst',
'group': 'storage',
'default_enabled': True,
'hidden': True,
},
'burst_cluster': {
'title': '{hostname}: Cluster flapping detected ({count} changes)',
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
'label': 'Cluster flapping burst',
'group': 'cluster',
'default_enabled': True,
'hidden': True,
},
'burst_service_fail': {
'title': '{hostname}: {count} services failed in {window}',
'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
'group': 'system',
'label': 'Service fail burst',
'group': 'services',
'default_enabled': True,
'hidden': True,
},
'burst_system': {
'title': '{hostname}: {count} system problems in {window}',
'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}',
'group': 'system',
'label': 'System problems burst',
'group': 'services',
'default_enabled': True,
'hidden': True,
},
'burst_generic': {
'title': '{hostname}: {count} {event_type} events in {window}',
'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}',
'group': 'system',
'label': 'Generic burst',
'group': 'other',
'default_enabled': True,
'hidden': True,
},
}
# ─── Event Groups (for UI filtering) ─────────────────────────────
# Display metadata for each notification group, keyed by the 'group' value
# used in TEMPLATES entries. Each value carries the short 'label' and the
# one-line 'description' shown next to the group in the UI.
#
# Every key appears exactly once: a repeated key in a dict literal is legal
# Python, but only the last value survives, which makes earlier entries
# silently dead.
EVENT_GROUPS = {
    # NOTE(review): kept so lookups of 'system' keep working; confirm whether
    # any template still uses this group before removing it.
    'system':    {'label': 'System',         'description': 'System health, services, updates'},
    'vm_ct':     {'label': 'VM / CT',        'description': 'Start, stop, crash, migration'},
    'backup':    {'label': 'Backups',        'description': 'Backup start, complete, fail'},
    'resources': {'label': 'Resources',      'description': 'CPU, memory, temperature'},
    'storage':   {'label': 'Storage',        'description': 'Disk space, I/O, SMART'},
    'network':   {'label': 'Network',        'description': 'Connectivity, bond, latency'},
    'security':  {'label': 'Security',       'description': 'Auth failures, Fail2Ban, firewall'},
    'cluster':   {'label': 'Cluster',        'description': 'Quorum, split-brain, HA fencing'},
    'services':  {'label': 'Services',       'description': 'System services, shutdown, reboot'},
    'health':    {'label': 'Health Monitor', 'description': 'Health checks, degradation, recovery'},
    'updates':   {'label': 'Updates',        'description': 'System and PVE updates'},
    # Catch-all group for uncategorized notifications.
    'other':     {'label': 'Other',          'description': 'Uncategorized notifications'},
}
@@ -777,14 +867,16 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
template = TEMPLATES.get(event_type)
if not template:
# Catch-all: unknown event types always get delivered (group 'other')
# so no Proxmox notification is ever silently dropped.
fallback_body = data.get('message', data.get('reason', str(data)))
severity = data.get('severity', 'INFO')
return {
'title': f"{_get_hostname()}: {event_type}",
'body': fallback_body, 'body_text': fallback_body,
'body_html': f'<p>{html_mod.escape(str(fallback_body))}</p>',
'fields': [], 'tags': [severity, 'system', event_type],
'severity': severity, 'group': 'system',
'fields': [], 'tags': [severity, 'other', event_type],
'severity': severity, 'group': 'other',
}
# Ensure hostname is always available
@@ -883,24 +975,36 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
def get_event_types_by_group() -> Dict[str, list]:
    """Get all visible event types organized by group, for UI rendering.

    Templates flagged with ``'hidden': True`` are skipped, so they never
    appear in the returned structure.

    The display text of each entry is the template's explicit ``'label'``.
    When a template has no label, one is derived from its ``'title'`` by
    removing the ``{hostname}`` prefix and any remaining ``{placeholder}``
    tokens; if nothing is left, the event type key itself is humanized
    (``'oom_kill'`` -> ``'Oom Kill'``).

    Templates without a ``'group'`` are placed under ``'other'``, and a
    missing ``'default_enabled'`` is reported as ``True``.

    Returns:
        {group_key: [{'type': event_type, 'title': label,
                      'default_enabled': bool}, ...]}
    """
    # Function-scope import; compiled once instead of on every loop iteration.
    import re
    placeholder_re = re.compile(r'\s*\{[^}]+\}')

    result: Dict[str, list] = {}
    for event_type, template in TEMPLATES.items():
        # Skip templates that are not meant to be shown in the UI.
        if template.get('hidden', False):
            continue

        # Prefer the explicit label; otherwise derive one from the title.
        label = template.get('label', '')
        if not label:
            raw_title = template.get('title', '')
            label = raw_title.replace('{hostname}', '').strip(': ')
            label = placeholder_re.sub('', label).strip(' -:')
        if not label:
            # Title was empty or consisted only of placeholders.
            label = event_type.replace('_', ' ').title()

        group = template.get('group', 'other')
        result.setdefault(group, []).append({
            'type': event_type,
            'title': label,
            'default_enabled': template.get('default_enabled', True),
        })
    return result