mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-05 20:03:48 +00:00
Update notification service
This commit is contained in:
@@ -40,13 +40,18 @@ interface EventTypeInfo {
|
||||
default_enabled: boolean
|
||||
}
|
||||
|
||||
/**
 * Per-channel filter overrides layered on top of the global settings.
 *
 * A key that is absent from either map means the channel inherits the
 * global value; a present key (true or false) is an explicit override.
 */
interface ChannelOverrides {
  /** Category (event group) overrides, keyed by category key such as "backup". */
  categories: Record<string, boolean>
  /** Individual event overrides, keyed by event type such as "vm_start". */
  events: Record<string, boolean>
}
|
||||
|
||||
interface NotificationConfig {
|
||||
enabled: boolean
|
||||
channels: Record<string, ChannelConfig>
|
||||
severity_filter: string
|
||||
event_categories: Record<string, boolean>
|
||||
event_toggles: Record<string, boolean>
|
||||
event_types_by_group: Record<string, EventTypeInfo[]>
|
||||
channel_overrides: Record<string, ChannelOverrides>
|
||||
ai_enabled: boolean
|
||||
ai_provider: string
|
||||
ai_api_key: string
|
||||
@@ -79,23 +84,22 @@ interface HistoryEntry {
|
||||
error_message: string | null
|
||||
}
|
||||
|
||||
/**
 * Choices offered by the "Severity Filter" select.
 *
 * The `value` is what gets stored in `severity_filter`. The least restrictive
 * choice uses "all" so that it matches the default config value and the
 * backend's severity mapping ("all" | "warning" | "critical").
 * NOTE(review): the previous value here was "info"; the backend falls back to
 * INFO for unrecognised values, so configs saved with "info" keep filtering
 * the same way, but the select will not pre-select an option for them.
 */
const SEVERITY_OPTIONS = [
  { value: "critical", label: "Critical only" },
  { value: "warning", label: "Warning + Critical" },
  { value: "all", label: "All (Info + Warning + Critical)" },
]
|
||||
|
||||
/**
 * Event categories shown in the settings UI, in display order.
 *
 * `key` is used both as the React list key and as the lookup key into
 * `event_categories` / per-channel `categories`, so every key must be unique.
 * NOTE(review): the backend templates visible alongside this file no longer
 * assign events to the "system" group — confirm whether that entry is still
 * needed.
 */
const EVENT_CATEGORIES = [
  { key: "system", label: "System", desc: "Startup, shutdown, kernel events" },
  { key: "vm_ct", label: "VM / CT", desc: "Start, stop, crash, migration" },
  { key: "backup", label: "Backups", desc: "Backup start, complete, fail" },
  { key: "resources", label: "Resources", desc: "CPU, memory, temperature" },
  { key: "storage", label: "Storage", desc: "Disk space, I/O, SMART" },
  { key: "network", label: "Network", desc: "Connectivity, bond, latency" },
  { key: "security", label: "Security", desc: "Auth failures, Fail2Ban, firewall" },
  { key: "cluster", label: "Cluster", desc: "Quorum, split-brain, HA fencing" },
  { key: "services", label: "Services", desc: "System services, shutdown, reboot" },
  { key: "health", label: "Health Monitor", desc: "Health checks, degradation, recovery" },
  { key: "updates", label: "Updates", desc: "System and PVE updates" },
  { key: "other", label: "Other", desc: "Uncategorized notifications" },
]
|
||||
|
||||
/** Channel identifiers iterated when rendering the per-channel filter sections; `as const` keeps them as a readonly tuple of string literals. */
const CHANNEL_TYPES = ["telegram", "gotify", "discord", "email"] as const
|
||||
|
||||
const AI_PROVIDERS = [
|
||||
{ value: "openai", label: "OpenAI" },
|
||||
{ value: "groq", label: "Groq" },
|
||||
@@ -109,13 +113,19 @@ const DEFAULT_CONFIG: NotificationConfig = {
|
||||
discord: { enabled: false },
|
||||
email: { enabled: false },
|
||||
},
|
||||
severity_filter: "all",
|
||||
event_categories: {
|
||||
system: true, vm_ct: true, backup: true, resources: true,
|
||||
storage: true, network: true, security: true, cluster: true,
|
||||
vm_ct: true, backup: true, resources: true, storage: true,
|
||||
network: true, security: true, cluster: true, services: true,
|
||||
health: true, updates: true, other: true,
|
||||
},
|
||||
event_toggles: {},
|
||||
event_types_by_group: {},
|
||||
channel_overrides: {
|
||||
telegram: { categories: {}, events: {} },
|
||||
gotify: { categories: {}, events: {} },
|
||||
discord: { categories: {}, events: {} },
|
||||
email: { categories: {}, events: {} },
|
||||
},
|
||||
ai_enabled: false,
|
||||
ai_provider: "openai",
|
||||
ai_api_key: "",
|
||||
@@ -217,7 +227,6 @@ export function NotificationSettings() {
|
||||
const flattenConfig = (cfg: NotificationConfig): Record<string, string> => {
|
||||
const flat: Record<string, string> = {
|
||||
enabled: String(cfg.enabled),
|
||||
severity_filter: cfg.severity_filter,
|
||||
ai_enabled: String(cfg.ai_enabled),
|
||||
ai_provider: cfg.ai_provider,
|
||||
ai_api_key: cfg.ai_api_key,
|
||||
@@ -235,20 +244,17 @@ export function NotificationSettings() {
|
||||
flat[`${chName}.${field}`] = String(value ?? "")
|
||||
}
|
||||
}
|
||||
// Flatten event_categories: { system: true, backups: false } -> events.system, events.backups
|
||||
// Flatten global event_categories: { vm_ct: true, backup: false } -> events.vm_ct, events.backup
|
||||
for (const [cat, enabled] of Object.entries(cfg.event_categories)) {
|
||||
flat[`events.${cat}`] = String(enabled)
|
||||
}
|
||||
// Flatten event_toggles: { vm_start: true, vm_stop: false } -> event.vm_start, event.vm_stop
|
||||
// Always write ALL toggles to DB so the backend has an explicit record.
|
||||
// This ensures default_enabled changes in templates don't get overridden by stale DB values.
|
||||
// Flatten global event_toggles: { vm_start: true } -> event.vm_start
|
||||
if (cfg.event_toggles) {
|
||||
for (const [evt, enabled] of Object.entries(cfg.event_toggles)) {
|
||||
flat[`event.${evt}`] = String(enabled)
|
||||
}
|
||||
}
|
||||
// Also write any events NOT in event_toggles using their template defaults.
|
||||
// This covers newly added templates whose default_enabled may be false.
|
||||
// Write defaults for events NOT in toggles
|
||||
if (cfg.event_types_by_group) {
|
||||
for (const events of Object.values(cfg.event_types_by_group)) {
|
||||
for (const evt of (events as Array<{type: string, default_enabled: boolean}>)) {
|
||||
@@ -259,6 +265,21 @@ export function NotificationSettings() {
|
||||
}
|
||||
}
|
||||
}
|
||||
// Flatten per-channel overrides: telegram.events.backup, telegram.event.vm_start, etc.
|
||||
if (cfg.channel_overrides) {
|
||||
for (const [chName, overrides] of Object.entries(cfg.channel_overrides)) {
|
||||
if (overrides.categories) {
|
||||
for (const [cat, enabled] of Object.entries(overrides.categories)) {
|
||||
flat[`${chName}.events.${cat}`] = String(enabled)
|
||||
}
|
||||
}
|
||||
if (overrides.events) {
|
||||
for (const [evt, enabled] of Object.entries(overrides.events)) {
|
||||
flat[`${chName}.event.${evt}`] = String(enabled)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return flat
|
||||
}
|
||||
|
||||
@@ -1052,27 +1073,8 @@ matcher: proxmenux-pbs
|
||||
<span className="text-xs font-medium text-muted-foreground uppercase tracking-wider">Filters & Events</span>
|
||||
</div>
|
||||
<div className="rounded-lg border border-border/50 bg-muted/20 p-3 space-y-4">
|
||||
{/* Severity */}
|
||||
{/* Event Categories (global defaults -- per-channel overrides in Channel Filters below) */}
|
||||
<div className="space-y-1.5">
|
||||
<Label className="text-[11px] text-muted-foreground">Severity Filter</Label>
|
||||
<Select
|
||||
value={config.severity_filter}
|
||||
onValueChange={v => updateConfig(p => ({ ...p, severity_filter: v }))}
|
||||
disabled={!editMode}
|
||||
>
|
||||
<SelectTrigger className={`h-8 text-xs ${!editMode ? "opacity-60" : ""}`}>
|
||||
<SelectValue />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{SEVERITY_OPTIONS.map(opt => (
|
||||
<SelectItem key={opt.value} value={opt.value}>{opt.label}</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
{/* Event Categories */}
|
||||
<div className="space-y-1.5 border-t border-border/30 pt-3">
|
||||
<Label className="text-[11px] text-muted-foreground">Event Categories</Label>
|
||||
<div className="space-y-1.5">
|
||||
{EVENT_CATEGORIES.map(cat => {
|
||||
@@ -1198,6 +1200,118 @@ matcher: proxmenux-pbs
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Per-channel overrides */}
|
||||
<div className="space-y-2 border-t border-border/30 pt-3">
|
||||
<Label className="text-[11px] text-muted-foreground">Channel Filters</Label>
|
||||
<p className="text-[10px] text-muted-foreground leading-relaxed">
|
||||
By default every channel inherits the global settings above. Override specific categories per channel to customize what each destination receives.
|
||||
</p>
|
||||
<div className="space-y-2">
|
||||
{CHANNEL_TYPES.map(chName => {
|
||||
const chEnabled = config.channels[chName]?.enabled
|
||||
if (!chEnabled) return null
|
||||
const overrides = config.channel_overrides?.[chName] || { categories: {}, events: {} }
|
||||
const hasOverrides = Object.keys(overrides.categories).length > 0
|
||||
const chLabel = chName === "email" ? "Email" : chName.charAt(0).toUpperCase() + chName.slice(1)
|
||||
const chColor = chName === "telegram" ? "blue" : chName === "gotify" ? "green" : chName === "discord" ? "indigo" : "amber"
|
||||
|
||||
return (
|
||||
<details key={chName} className="group">
|
||||
<summary className={`flex items-center justify-between text-[11px] font-medium cursor-pointer hover:text-foreground transition-colors py-1.5 px-2 rounded-md hover:bg-muted/50 ${
|
||||
hasOverrides ? `text-${chColor}-400` : "text-muted-foreground"
|
||||
}`}>
|
||||
<div className="flex items-center gap-2">
|
||||
<ChevronDown className="h-3 w-3 group-open:rotate-180 transition-transform" />
|
||||
<span>{chLabel}</span>
|
||||
{hasOverrides && (
|
||||
<span className={`text-[9px] px-1.5 py-0.5 rounded-full bg-${chColor}-500/15 text-${chColor}-400`}>
|
||||
customized
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{!hasOverrides && (
|
||||
<span className="text-[9px] text-muted-foreground/60">inherits global</span>
|
||||
)}
|
||||
</summary>
|
||||
<div className="mt-1.5 ml-5 space-y-1">
|
||||
{EVENT_CATEGORIES.map(cat => {
|
||||
const globalEnabled = config.event_categories[cat.key] ?? true
|
||||
const override = overrides.categories[cat.key]
|
||||
const isCustomized = override !== undefined
|
||||
const effectiveEnabled = isCustomized ? override : globalEnabled
|
||||
|
||||
return (
|
||||
<div key={cat.key} className="flex items-center justify-between py-1 px-2 rounded hover:bg-muted/30">
|
||||
<div className="flex items-center gap-2">
|
||||
<span className={`text-[11px] ${effectiveEnabled ? "text-foreground" : "text-muted-foreground/50"}`}>
|
||||
{cat.label}
|
||||
</span>
|
||||
{!isCustomized && (
|
||||
<span className="text-[9px] text-muted-foreground/40">global</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex items-center gap-1.5">
|
||||
{isCustomized && (
|
||||
<button
|
||||
type="button"
|
||||
className="text-[9px] text-muted-foreground hover:text-foreground px-1"
|
||||
disabled={!editMode}
|
||||
onClick={() => {
|
||||
if (!editMode) return
|
||||
updateConfig(p => {
|
||||
const ch = { ...(p.channel_overrides?.[chName] || { categories: {}, events: {} }) }
|
||||
const cats = { ...ch.categories }
|
||||
delete cats[cat.key]
|
||||
return { ...p, channel_overrides: { ...p.channel_overrides, [chName]: { ...ch, categories: cats } } }
|
||||
})
|
||||
}}
|
||||
>
|
||||
reset
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
type="button"
|
||||
role="switch"
|
||||
aria-checked={effectiveEnabled}
|
||||
disabled={!editMode}
|
||||
className={`relative inline-flex h-3.5 w-6 shrink-0 items-center rounded-full transition-colors ${
|
||||
!editMode ? "opacity-50 cursor-not-allowed" : "cursor-pointer"
|
||||
} ${effectiveEnabled ? `bg-${chColor}-600` : "bg-muted-foreground/30"}`}
|
||||
onClick={() => {
|
||||
if (!editMode) return
|
||||
updateConfig(p => {
|
||||
const ch = { ...(p.channel_overrides?.[chName] || { categories: {}, events: {} }) }
|
||||
return {
|
||||
...p,
|
||||
channel_overrides: {
|
||||
...p.channel_overrides,
|
||||
[chName]: { ...ch, categories: { ...ch.categories, [cat.key]: !effectiveEnabled } }
|
||||
}
|
||||
}
|
||||
})
|
||||
}}
|
||||
>
|
||||
<span className={`pointer-events-none block h-2.5 w-2.5 rounded-full bg-background shadow-sm transition-transform ${
|
||||
effectiveEnabled ? "translate-x-3" : "translate-x-0.5"
|
||||
}`} />
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</details>
|
||||
)
|
||||
})}
|
||||
{CHANNEL_TYPES.every(ch => !config.channels[ch]?.enabled) && (
|
||||
<p className="text-[10px] text-muted-foreground/50 italic py-2">
|
||||
Enable at least one channel above to configure per-channel filters.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>{/* close bordered filters container */}
|
||||
</div>
|
||||
|
||||
|
||||
@@ -575,13 +575,31 @@ def _temperature_collector_loop():
|
||||
|
||||
def _health_collector_loop():
|
||||
"""Background thread: run full health checks every 5 minutes.
|
||||
Keeps the health cache always fresh and records events/errors in the DB
|
||||
so the future notification service can consume them."""
|
||||
Keeps the health cache always fresh and records events/errors in the DB.
|
||||
Also emits notifications when a health category degrades (OK -> WARNING/CRITICAL)."""
|
||||
from health_monitor import health_monitor
|
||||
|
||||
# Wait 30s after startup to let other services initialize
|
||||
time.sleep(30)
|
||||
|
||||
# Track previous status per category to detect transitions
|
||||
_prev_statuses = {}
|
||||
# Severity ranking for comparison
|
||||
_SEV_RANK = {'OK': 0, 'INFO': 0, 'UNKNOWN': 1, 'WARNING': 2, 'CRITICAL': 3}
|
||||
# Human-readable category names
|
||||
_CAT_NAMES = {
|
||||
'cpu': 'CPU Usage & Temperature',
|
||||
'memory': 'Memory & Swap',
|
||||
'storage': 'Storage Mounts & Space',
|
||||
'disks': 'Disk I/O & Errors',
|
||||
'network': 'Network Interfaces',
|
||||
'vms': 'VMs & Containers',
|
||||
'services': 'PVE Services',
|
||||
'logs': 'System Logs',
|
||||
'updates': 'System Updates',
|
||||
'security': 'Security',
|
||||
}
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Run full health check (results get cached internally + recorded in DB)
|
||||
@@ -598,6 +616,64 @@ def _health_collector_loop():
|
||||
health_monitor.cached_results['_bg_detailed'] = result
|
||||
health_monitor.last_check_times['_bg_overall'] = time.time()
|
||||
health_monitor.last_check_times['_bg_detailed'] = time.time()
|
||||
|
||||
# ── Health degradation notifications ──
|
||||
# Compare each category's current status to previous cycle.
|
||||
# Notify when a category DEGRADES (OK->WARNING, WARNING->CRITICAL, etc.)
|
||||
# Include the detailed 'reason' so the user knows exactly what triggered it.
|
||||
details = result.get('details', {})
|
||||
degraded = []
|
||||
|
||||
for cat_key, cat_data in details.items():
|
||||
cur_status = cat_data.get('status', 'OK')
|
||||
prev_status = _prev_statuses.get(cat_key, 'OK')
|
||||
cur_rank = _SEV_RANK.get(cur_status, 0)
|
||||
prev_rank = _SEV_RANK.get(prev_status, 0)
|
||||
|
||||
if cur_rank > prev_rank and cur_rank >= 2: # WARNING or CRITICAL
|
||||
reason = cat_data.get('reason', f'{cat_key} status changed to {cur_status}')
|
||||
cat_name = _CAT_NAMES.get(cat_key, cat_key)
|
||||
degraded.append({
|
||||
'category': cat_name,
|
||||
'status': cur_status,
|
||||
'reason': reason,
|
||||
})
|
||||
|
||||
_prev_statuses[cat_key] = cur_status
|
||||
|
||||
# Send grouped notification if any categories degraded
|
||||
if degraded and notification_manager._enabled:
|
||||
hostname = result.get('hostname', '')
|
||||
if not hostname:
|
||||
import socket as _sock
|
||||
hostname = _sock.gethostname()
|
||||
|
||||
if len(degraded) == 1:
|
||||
d = degraded[0]
|
||||
title = f"{hostname}: Health {d['status']} - {d['category']}"
|
||||
body = d['reason']
|
||||
severity = d['status']
|
||||
else:
|
||||
# Multiple categories degraded at once -- group them
|
||||
max_sev = max(degraded, key=lambda x: _SEV_RANK.get(x['status'], 0))['status']
|
||||
title = f"{hostname}: {len(degraded)} health checks degraded"
|
||||
lines = []
|
||||
for d in degraded:
|
||||
lines.append(f" [{d['status']}] {d['category']}: {d['reason']}")
|
||||
body = '\n'.join(lines)
|
||||
severity = max_sev
|
||||
|
||||
try:
|
||||
notification_manager.send_notification(
|
||||
event_type='health_degraded',
|
||||
severity=severity,
|
||||
title=title,
|
||||
message=body,
|
||||
data={'hostname': hostname, 'count': str(len(degraded))},
|
||||
source='health_monitor',
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"[ProxMenux] Health notification error: {e}")
|
||||
except Exception as e:
|
||||
print(f"[ProxMenux] Health collector error: {e}")
|
||||
|
||||
|
||||
@@ -2778,24 +2778,41 @@ class HealthMonitor:
|
||||
return 'INFO'
|
||||
return severity
|
||||
|
||||
# Build detail strings that include the actual error samples
|
||||
# so the user can see exactly WHAT is triggering the warning.
|
||||
if cascade_count > 0:
|
||||
cascade_detail = f'{cascade_count} pattern(s) repeating >=15 times: ' + '; '.join(cascade_samples)
|
||||
else:
|
||||
cascade_detail = 'No cascading errors'
|
||||
|
||||
if spike_count > 0:
|
||||
spike_detail = f'{spike_count} pattern(s) with 4x increase: ' + '; '.join(spike_samples)
|
||||
else:
|
||||
spike_detail = 'No error spikes'
|
||||
|
||||
if persistent_count > 0:
|
||||
persist_detail = f'{persistent_count} recurring pattern(s) over 15+ min: ' + '; '.join(persist_samples)
|
||||
else:
|
||||
persist_detail = 'No persistent patterns'
|
||||
|
||||
log_checks = {
|
||||
'log_error_cascade': {
|
||||
'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'),
|
||||
'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors',
|
||||
'detail': cascade_detail,
|
||||
'dismissable': True,
|
||||
'dismissed': 'log_error_cascade' in dismissed_keys,
|
||||
'error_key': 'log_error_cascade'
|
||||
},
|
||||
'log_error_spike': {
|
||||
'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'),
|
||||
'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes',
|
||||
'detail': spike_detail,
|
||||
'dismissable': True,
|
||||
'dismissed': 'log_error_spike' in dismissed_keys,
|
||||
'error_key': 'log_error_spike'
|
||||
},
|
||||
'log_persistent_errors': {
|
||||
'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'),
|
||||
'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns',
|
||||
'detail': persist_detail,
|
||||
'dismissable': True,
|
||||
'dismissed': 'log_persistent_errors' in dismissed_keys,
|
||||
'error_key': 'log_persistent_errors'
|
||||
|
||||
@@ -69,9 +69,15 @@ GROUP_RATE_LIMITS = {
|
||||
'resources': {'max_per_minute': 3, 'max_per_hour': 20},
|
||||
'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60},
|
||||
'backup': {'max_per_minute': 5, 'max_per_hour': 30},
|
||||
'system': {'max_per_minute': 5, 'max_per_hour': 30},
|
||||
'services': {'max_per_minute': 5, 'max_per_hour': 30},
|
||||
'health': {'max_per_minute': 3, 'max_per_hour': 20},
|
||||
'updates': {'max_per_minute': 3, 'max_per_hour': 15},
|
||||
'other': {'max_per_minute': 5, 'max_per_hour': 30},
|
||||
}
|
||||
|
||||
# Default fallback for unknown groups
|
||||
_DEFAULT_RATE_LIMIT = {'max_per_minute': 5, 'max_per_hour': 30}
|
||||
|
||||
|
||||
class GroupRateLimiter:
|
||||
"""Rate limiter per event group. Prevents notification storms."""
|
||||
@@ -84,7 +90,7 @@ class GroupRateLimiter:
|
||||
|
||||
def allow(self, group: str) -> bool:
|
||||
"""Check if group rate limit allows this event."""
|
||||
limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system'])
|
||||
limits = GROUP_RATE_LIMITS.get(group, _DEFAULT_RATE_LIMIT)
|
||||
now = time.time()
|
||||
|
||||
# Initialize if needed
|
||||
@@ -554,35 +560,28 @@ class NotificationManager:
|
||||
print(f"[NotificationManager] Aggregation flush error: {e}")
|
||||
|
||||
def _process_event(self, event: NotificationEvent):
|
||||
"""Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch."""
|
||||
"""Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch.
|
||||
|
||||
NOTE: Group and per-event filters are checked globally here.
|
||||
Per-channel overrides are applied later in _dispatch_to_channels().
|
||||
"""
|
||||
if not self._enabled:
|
||||
return
|
||||
|
||||
# Check if this event's GROUP is enabled in settings.
|
||||
# The UI saves categories by group key: events.vm_ct, events.backup, etc.
|
||||
# Check if this event's GROUP is enabled globally.
|
||||
template = TEMPLATES.get(event.event_type, {})
|
||||
event_group = template.get('group', 'system')
|
||||
event_group = template.get('group', 'other')
|
||||
group_setting = f'events.{event_group}'
|
||||
if self._config.get(group_setting, 'true') == 'false':
|
||||
return
|
||||
|
||||
# Check if this SPECIFIC event type is enabled (granular per-event toggle).
|
||||
# Key format: event.{event_type} = "true"/"false"
|
||||
# Check if this SPECIFIC event type is enabled globally.
|
||||
# Default comes from the template's default_enabled field.
|
||||
default_enabled = 'true' if template.get('default_enabled', True) else 'false'
|
||||
event_specific = f'event.{event.event_type}'
|
||||
if self._config.get(event_specific, default_enabled) == 'false':
|
||||
return
|
||||
|
||||
# Check severity filter.
|
||||
# The UI saves severity_filter as: "all", "warning", "critical".
|
||||
# Map to our internal severity names for comparison.
|
||||
severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
|
||||
raw_filter = self._config.get('severity_filter', 'all')
|
||||
min_severity = severity_map.get(raw_filter.lower(), 'INFO')
|
||||
if not self._meets_severity(event.severity, min_severity):
|
||||
return
|
||||
|
||||
# Try aggregation (may buffer the event)
|
||||
result = self._aggregator.ingest(event)
|
||||
if result is None:
|
||||
@@ -593,30 +592,23 @@ class NotificationManager:
|
||||
self._dispatch_event(event)
|
||||
|
||||
def _process_event_direct(self, event: NotificationEvent):
    """Process a burst summary event.

    Bypasses the aggregator but still applies the global filters (group
    toggle, per-event toggle, severity threshold) before handing the event
    to ``_dispatch_event``. Returns ``None`` in every case; a filtered
    event is silently dropped.
    """
    if not self._enabled:
        return

    # Group filter: events without a template group fall into 'other'.
    template = TEMPLATES.get(event.event_type, {})
    event_group = template.get('group', 'other')
    if self._config.get(f'events.{event_group}', 'true') == 'false':
        return

    # Per-event filter: when no explicit setting is stored, the template's
    # default_enabled flag decides.
    default_enabled = 'true' if template.get('default_enabled', True) else 'false'
    if self._config.get(f'event.{event.event_type}', default_enabled) == 'false':
        return

    # Severity filter: map the stored UI value to an internal severity name;
    # unrecognised values fall back to the least restrictive level.
    severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'}
    raw_filter = self._config.get('severity_filter', 'all')
    min_severity = severity_map.get(raw_filter.lower(), 'INFO')
    if not self._meets_severity(event.severity, min_severity):
        return

    self._dispatch_event(event)
|
||||
|
||||
def _dispatch_event(self, event: NotificationEvent):
|
||||
@@ -636,7 +628,7 @@ class NotificationManager:
|
||||
|
||||
# Check group rate limit
|
||||
template = TEMPLATES.get(event.event_type, {})
|
||||
group = template.get('group', 'system')
|
||||
group = template.get('group', 'other')
|
||||
if not self._group_limiter.allow(group):
|
||||
return
|
||||
|
||||
@@ -674,11 +666,33 @@ class NotificationManager:
|
||||
|
||||
def _dispatch_to_channels(self, title: str, body: str, severity: str,
|
||||
event_type: str, data: Dict, source: str):
|
||||
"""Send notification through all configured channels."""
|
||||
"""Send notification through configured channels, respecting per-channel overrides.
|
||||
|
||||
Each channel can override global category/event settings:
|
||||
- {channel}.events.{group} = "true"/"false" (category override)
|
||||
- {channel}.event.{type} = "true"/"false" (per-event override)
|
||||
If no override exists, the channel inherits the global setting (already checked).
|
||||
"""
|
||||
with self._lock:
|
||||
channels = dict(self._channels)
|
||||
|
||||
template = TEMPLATES.get(event_type, {})
|
||||
event_group = template.get('group', 'other')
|
||||
|
||||
for ch_name, channel in channels.items():
|
||||
# ── Per-channel override check ──
|
||||
# If the channel has an explicit override for this group or event, respect it.
|
||||
# If no override, the global filter already passed (checked in _process_event).
|
||||
ch_group_key = f'{ch_name}.events.{event_group}'
|
||||
ch_group_override = self._config.get(ch_group_key)
|
||||
if ch_group_override == 'false':
|
||||
continue # Channel explicitly disabled this category
|
||||
|
||||
ch_event_key = f'{ch_name}.event.{event_type}'
|
||||
ch_event_override = self._config.get(ch_event_key)
|
||||
if ch_event_override == 'false':
|
||||
continue # Channel explicitly disabled this event
|
||||
|
||||
try:
|
||||
result = channel.send(title, body, severity, data)
|
||||
self._record_history(
|
||||
@@ -857,12 +871,6 @@ class NotificationManager:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _meets_severity(event_severity: str, min_severity: str) -> bool:
|
||||
"""Check if event severity meets the minimum threshold."""
|
||||
levels = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2}
|
||||
return levels.get(event_severity, 0) >= levels.get(min_severity, 0)
|
||||
|
||||
# ─── History Recording ──────────────────────────────────────
|
||||
|
||||
def _record_history(self, event_type: str, channel: str, title: str,
|
||||
@@ -1171,7 +1179,7 @@ class NotificationManager:
|
||||
channels[ch_type] = ch_cfg
|
||||
|
||||
# Build event_categories dict (group-level toggle)
|
||||
# EVENT_GROUPS is a dict: { 'system': {...}, 'vm_ct': {...}, ... }
|
||||
# EVENT_GROUPS is a dict: { 'vm_ct': {...}, 'services': {...}, 'health': {...}, ... }
|
||||
event_categories = {}
|
||||
for group_key in EVENT_GROUPS:
|
||||
event_categories[group_key] = self._config.get(f'events.{group_key}', 'true') == 'true'
|
||||
@@ -1189,13 +1197,28 @@ class NotificationManager:
|
||||
# Build event_types_by_group for UI rendering
|
||||
event_types_by_group = get_event_types_by_group()
|
||||
|
||||
# Build per-channel overrides
|
||||
# Keys: {channel}.events.{group} and {channel}.event.{event_type}
|
||||
channel_overrides = {}
|
||||
for ch_type in CHANNEL_TYPES:
|
||||
ch_overrides = {'categories': {}, 'events': {}}
|
||||
for group_key in EVENT_GROUPS:
|
||||
val = self._config.get(f'{ch_type}.events.{group_key}')
|
||||
if val is not None:
|
||||
ch_overrides['categories'][group_key] = val == 'true'
|
||||
for event_type_key in TEMPLATES:
|
||||
val = self._config.get(f'{ch_type}.event.{event_type_key}')
|
||||
if val is not None:
|
||||
ch_overrides['events'][event_type_key] = val == 'true'
|
||||
channel_overrides[ch_type] = ch_overrides
|
||||
|
||||
config = {
|
||||
'enabled': self._enabled,
|
||||
'channels': channels,
|
||||
'severity_filter': self._config.get('severity_filter', 'all'),
|
||||
'event_categories': event_categories,
|
||||
'event_toggles': event_toggles,
|
||||
'event_types_by_group': event_types_by_group,
|
||||
'channel_overrides': channel_overrides,
|
||||
'ai_enabled': self._config.get('ai_enabled', 'false') == 'true',
|
||||
'ai_provider': self._config.get('ai_provider', 'openai'),
|
||||
'ai_api_key': self._config.get('ai_api_key', ''),
|
||||
|
||||
@@ -342,25 +342,36 @@ TEMPLATES = {
|
||||
'state_change': {
|
||||
'title': '{hostname}: {category} changed to {current}',
|
||||
'body': '{category} status changed from {previous} to {current}.\n{reason}',
|
||||
'group': 'system',
|
||||
'label': 'Health state changed',
|
||||
'group': 'health',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'new_error': {
|
||||
'title': '{hostname}: New {severity} - {category}',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'New health issue',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'error_resolved': {
|
||||
'title': '{hostname}: Resolved - {category}',
|
||||
'body': '{reason}\nDuration: {duration}',
|
||||
'group': 'system',
|
||||
'label': 'Health issue resolved',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'error_escalated': {
|
||||
'title': '{hostname}: Escalated to {severity} - {category}',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'Health issue escalated',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'health_degraded': {
|
||||
'title': '{hostname}: Health check degraded',
|
||||
'body': '{reason}',
|
||||
'label': 'Health check degraded',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
@@ -368,90 +379,105 @@ TEMPLATES = {
|
||||
'vm_start': {
|
||||
'title': '{hostname}: VM {vmid} started',
|
||||
'body': '{vmname} ({vmid}) has been started.',
|
||||
'label': 'VM started',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'vm_stop': {
|
||||
'title': '{hostname}: VM {vmid} stopped',
|
||||
'body': '{vmname} ({vmid}) has been stopped.',
|
||||
'label': 'VM stopped',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'vm_shutdown': {
|
||||
'title': '{hostname}: VM {vmid} shutdown',
|
||||
'body': '{vmname} ({vmid}) has been shut down.',
|
||||
'label': 'VM shutdown',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'vm_fail': {
|
||||
'title': '{hostname}: VM {vmid} FAILED',
|
||||
'body': '{vmname} ({vmid}) has failed.\n{reason}',
|
||||
'label': 'VM FAILED',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'vm_restart': {
|
||||
'title': '{hostname}: VM {vmid} restarted',
|
||||
'body': '{vmname} ({vmid}) has been restarted.',
|
||||
'label': 'VM restarted',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'ct_start': {
|
||||
'title': '{hostname}: CT {vmid} started',
|
||||
'body': '{vmname} ({vmid}) has been started.',
|
||||
'label': 'CT started',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'ct_stop': {
|
||||
'title': '{hostname}: CT {vmid} stopped',
|
||||
'body': '{vmname} ({vmid}) has been stopped.',
|
||||
'label': 'CT stopped',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'ct_shutdown': {
|
||||
'title': '{hostname}: CT {vmid} shutdown',
|
||||
'body': '{vmname} ({vmid}) has been shut down.',
|
||||
'label': 'CT shutdown',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'ct_restart': {
|
||||
'title': '{hostname}: CT {vmid} restarted',
|
||||
'body': '{vmname} ({vmid}) has been restarted.',
|
||||
'label': 'CT restarted',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'ct_fail': {
|
||||
'title': '{hostname}: CT {vmid} FAILED',
|
||||
'body': '{vmname} ({vmid}) has failed.\n{reason}',
|
||||
'label': 'CT FAILED',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'migration_start': {
|
||||
'title': '{hostname}: Migration started - {vmid}',
|
||||
'body': '{vmname} ({vmid}) migration to {target_node} started.',
|
||||
'label': 'Migration started',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'migration_complete': {
|
||||
'title': '{hostname}: Migration complete - {vmid}',
|
||||
'body': '{vmname} ({vmid}) migrated successfully to {target_node}.',
|
||||
'label': 'Migration complete',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'migration_fail': {
|
||||
'title': '{hostname}: Migration FAILED - {vmid}',
|
||||
'body': '{vmname} ({vmid}) migration to {target_node} failed.\n{reason}',
|
||||
'label': 'Migration FAILED',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'replication_fail': {
|
||||
'title': '{hostname}: Replication FAILED - {vmid}',
|
||||
'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}',
|
||||
'label': 'Replication FAILED',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'replication_complete': {
|
||||
'title': '{hostname}: Replication complete - {vmid}',
|
||||
'body': 'Replication of {vmname} ({vmid}) completed successfully.',
|
||||
'label': 'Replication complete',
|
||||
'group': 'vm_ct',
|
||||
'default_enabled': False,
|
||||
},
|
||||
@@ -460,30 +486,35 @@ TEMPLATES = {
|
||||
'backup_start': {
|
||||
'title': '{hostname}: Backup started',
|
||||
'body': '{reason}',
|
||||
'label': 'Backup started',
|
||||
'group': 'backup',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'backup_complete': {
|
||||
'title': '{hostname}: Backup complete - {vmid}',
|
||||
'body': 'Backup of {vmname} ({vmid}) completed successfully.\nSize: {size}',
|
||||
'label': 'Backup complete',
|
||||
'group': 'backup',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'backup_fail': {
|
||||
'title': '{hostname}: Backup FAILED - {vmid}',
|
||||
'body': 'Backup of {vmname} ({vmid}) has failed.\n{reason}',
|
||||
'label': 'Backup FAILED',
|
||||
'group': 'backup',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'snapshot_complete': {
|
||||
'title': '{hostname}: Snapshot created - {vmid}',
|
||||
'body': 'Snapshot of {vmname} ({vmid}) created: {snapshot_name}',
|
||||
'label': 'Snapshot created',
|
||||
'group': 'backup',
|
||||
'default_enabled': False,
|
||||
},
|
||||
'snapshot_fail': {
|
||||
'title': '{hostname}: Snapshot FAILED - {vmid}',
|
||||
'body': 'Snapshot of {vmname} ({vmid}) failed.\n{reason}',
|
||||
'label': 'Snapshot FAILED',
|
||||
'group': 'backup',
|
||||
'default_enabled': True,
|
||||
},
|
||||
@@ -492,42 +523,49 @@ TEMPLATES = {
|
||||
'cpu_high': {
|
||||
'title': '{hostname}: High CPU usage ({value}%)',
|
||||
'body': 'CPU usage is at {value}% on {cores} cores.\n{details}',
|
||||
'label': 'High CPU usage',
|
||||
'group': 'resources',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'ram_high': {
|
||||
'title': '{hostname}: High memory usage ({value}%)',
|
||||
'body': 'Memory usage: {used} / {total} ({value}%).\n{details}',
|
||||
'label': 'High memory usage',
|
||||
'group': 'resources',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'temp_high': {
|
||||
'title': '{hostname}: High temperature ({value}C)',
|
||||
'body': 'CPU temperature: {value}C (threshold: {threshold}C).\n{details}',
|
||||
'label': 'High temperature',
|
||||
'group': 'resources',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'disk_space_low': {
|
||||
'title': '{hostname}: Low disk space on {mount}',
|
||||
'body': '{mount}: {used}% used ({available} available).',
|
||||
'label': 'Low disk space',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'disk_io_error': {
|
||||
'title': '{hostname}: Disk failure detected on {device}',
|
||||
'body': '{reason}',
|
||||
'label': 'Disk failure / I/O error',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'storage_unavailable': {
|
||||
'title': '{hostname}: Storage unavailable - {storage_name}',
|
||||
'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}',
|
||||
'label': 'Storage unavailable',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'load_high': {
|
||||
'title': '{hostname}: High system load ({value})',
|
||||
'body': 'System load average: {value} on {cores} cores.\n{details}',
|
||||
'label': 'High system load',
|
||||
'group': 'resources',
|
||||
'default_enabled': True,
|
||||
},
|
||||
@@ -536,12 +574,14 @@ TEMPLATES = {
|
||||
'network_down': {
|
||||
'title': '{hostname}: Network connectivity lost',
|
||||
'body': 'Network connectivity check failed.\n{reason}',
|
||||
'label': 'Network connectivity lost',
|
||||
'group': 'network',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'network_latency': {
|
||||
'title': '{hostname}: High network latency ({value}ms)',
|
||||
'body': 'Latency to gateway: {value}ms (threshold: {threshold}ms).',
|
||||
'label': 'High network latency',
|
||||
'group': 'network',
|
||||
'default_enabled': False,
|
||||
},
|
||||
@@ -550,24 +590,28 @@ TEMPLATES = {
|
||||
'auth_fail': {
|
||||
'title': '{hostname}: Authentication failure',
|
||||
'body': 'Failed login attempt from {source_ip}.\nUser: {username}\nService: {service}',
|
||||
'label': 'Authentication failure',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'ip_block': {
|
||||
'title': '{hostname}: IP blocked by Fail2Ban',
|
||||
'body': 'IP {source_ip} has been banned.\nJail: {jail}\nFailures: {failures}',
|
||||
'label': 'IP blocked by Fail2Ban',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'firewall_issue': {
|
||||
'title': '{hostname}: Firewall issue detected',
|
||||
'body': '{reason}',
|
||||
'label': 'Firewall issue detected',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'user_permission_change': {
|
||||
'title': '{hostname}: User permission changed',
|
||||
'body': 'User: {username}\nChange: {change_details}',
|
||||
'label': 'User permission changed',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
},
|
||||
@@ -576,101 +620,128 @@ TEMPLATES = {
|
||||
'split_brain': {
|
||||
'title': '{hostname}: SPLIT-BRAIN detected',
|
||||
'body': 'Cluster split-brain condition detected.\nQuorum status: {quorum}',
|
||||
'label': 'SPLIT-BRAIN detected',
|
||||
'group': 'cluster',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'node_disconnect': {
|
||||
'title': '{hostname}: Node disconnected',
|
||||
'body': 'Node {node_name} has disconnected from the cluster.',
|
||||
'label': 'Node disconnected',
|
||||
'group': 'cluster',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'node_reconnect': {
|
||||
'title': '{hostname}: Node reconnected',
|
||||
'body': 'Node {node_name} has reconnected to the cluster.',
|
||||
'label': 'Node reconnected',
|
||||
'group': 'cluster',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# ── System events ──
|
||||
# ── Services events ──
|
||||
'system_shutdown': {
|
||||
'title': '{hostname}: System shutting down',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'System shutting down',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'system_reboot': {
|
||||
'title': '{hostname}: System rebooting',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'System rebooting',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'system_problem': {
|
||||
'title': '{hostname}: System problem detected',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'System problem detected',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'service_fail': {
|
||||
'title': '{hostname}: Service failed - {service_name}',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'Service failed',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'oom_kill': {
|
||||
'title': '{hostname}: OOM Kill - {process}',
|
||||
'body': '{reason}',
|
||||
'label': 'Out of memory kill',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# ── Hidden internal templates (not shown in UI) ──
|
||||
'service_fail_batch': {
|
||||
'title': '{hostname}: {service_count} services failed',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'Service fail batch',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'system_mail': {
|
||||
'title': '{hostname}: {pve_title}',
|
||||
'body': '{reason}',
|
||||
'group': 'system',
|
||||
'label': 'PVE system mail',
|
||||
'group': 'other',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'webhook_test': {
|
||||
'title': '{hostname}: Webhook test received',
|
||||
'body': 'PVE webhook connectivity test successful.\n{reason}',
|
||||
'label': 'Webhook test',
|
||||
'group': 'other',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'update_available': {
|
||||
'title': '{hostname}: Updates available',
|
||||
'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}',
|
||||
'group': 'system',
|
||||
'default_enabled': False, # Superseded by update_summary
|
||||
},
|
||||
'update_complete': {
|
||||
'title': '{hostname}: Update completed',
|
||||
'body': '{details}',
|
||||
'group': 'system',
|
||||
'label': 'Updates available (legacy)',
|
||||
'group': 'updates',
|
||||
'default_enabled': False,
|
||||
'hidden': True,
|
||||
},
|
||||
|
||||
# ── Unknown persistent (from health monitor) ──
|
||||
'unknown_persistent': {
|
||||
'title': '{hostname}: Check unavailable - {category}',
|
||||
'body': 'Health check for {category} has been unavailable for 3+ cycles.\n{reason}',
|
||||
'group': 'system',
|
||||
'label': 'Check unavailable',
|
||||
'group': 'health',
|
||||
'default_enabled': False,
|
||||
'hidden': True,
|
||||
},
|
||||
|
||||
# ── Persistent Health Issues (daily digest) ──
|
||||
# ── Health Monitor events ──
|
||||
'health_persistent': {
|
||||
'title': '{hostname}: {count} active health issue(s)',
|
||||
'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.',
|
||||
'group': 'system',
|
||||
'label': 'Active health issues (daily)',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'health_issue_new': {
|
||||
'title': '{hostname}: New health issue - {category}',
|
||||
'body': 'New {severity} issue detected:\n{reason}',
|
||||
'group': 'system',
|
||||
'label': 'New health issue',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'health_issue_resolved': {
|
||||
'title': '{hostname}: Resolved - {category}',
|
||||
'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}',
|
||||
'group': 'system',
|
||||
'label': 'Health issue resolved',
|
||||
'group': 'health',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# ── Update notifications (enriched) ──
|
||||
# ── Update notifications ──
|
||||
'update_summary': {
|
||||
'title': '{hostname}: Updates available',
|
||||
'body': (
|
||||
@@ -680,80 +751,99 @@ TEMPLATES = {
|
||||
'Kernel updates: {kernel_count}\n'
|
||||
'Important packages: {important_list}'
|
||||
),
|
||||
'group': 'system',
|
||||
'label': 'Updates available',
|
||||
'group': 'updates',
|
||||
'default_enabled': True,
|
||||
},
|
||||
'pve_update': {
|
||||
'title': '{hostname}: Proxmox VE {new_version} available',
|
||||
'body': 'Proxmox VE {current_version} -> {new_version}\n{details}',
|
||||
'group': 'system',
|
||||
'label': 'Proxmox VE update available',
|
||||
'group': 'updates',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# ── PVE webhook test ──
|
||||
'webhook_test': {
|
||||
'title': '{hostname}: Webhook test received',
|
||||
'body': 'PVE webhook connectivity test successful.\n{reason}',
|
||||
'group': 'system',
|
||||
'default_enabled': True,
|
||||
'update_complete': {
|
||||
'title': '{hostname}: Update completed',
|
||||
'body': '{details}',
|
||||
'label': 'Update completed',
|
||||
'group': 'updates',
|
||||
'default_enabled': False,
|
||||
},
|
||||
|
||||
# ── Burst aggregation summaries ──
|
||||
# ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
|
||||
# These inherit enabled state from their parent event type at dispatch time.
|
||||
'burst_auth_fail': {
|
||||
'title': '{hostname}: {count} auth failures in {window}',
|
||||
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
|
||||
'label': 'Auth failures burst',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_ip_block': {
|
||||
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
|
||||
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
|
||||
'label': 'IP block burst',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_disk_io': {
|
||||
'title': '{hostname}: {count} disk I/O errors on {entity_list}',
|
||||
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
|
||||
'label': 'Disk I/O burst',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_cluster': {
|
||||
'title': '{hostname}: Cluster flapping detected ({count} changes)',
|
||||
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
|
||||
'label': 'Cluster flapping burst',
|
||||
'group': 'cluster',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_service_fail': {
|
||||
'title': '{hostname}: {count} services failed in {window}',
|
||||
'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
|
||||
'group': 'system',
|
||||
'label': 'Service fail burst',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_system': {
|
||||
'title': '{hostname}: {count} system problems in {window}',
|
||||
'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}',
|
||||
'group': 'system',
|
||||
'label': 'System problems burst',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_generic': {
|
||||
'title': '{hostname}: {count} {event_type} events in {window}',
|
||||
'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}',
|
||||
'group': 'system',
|
||||
'label': 'Generic burst',
|
||||
'group': 'other',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
}
|
||||
|
||||
# ─── Event Groups (for UI filtering) ─────────────────────────────
|
||||
|
||||
EVENT_GROUPS = {
|
||||
'system': {'label': 'System', 'description': 'System health, services, updates'},
|
||||
'vm_ct': {'label': 'VM / CT', 'description': 'Virtual machines and containers'},
|
||||
'backup': {'label': 'Backup', 'description': 'Backups and snapshots'},
|
||||
'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature, load'},
|
||||
'storage': {'label': 'Storage', 'description': 'Disk space and I/O'},
|
||||
'network': {'label': 'Network', 'description': 'Connectivity and latency'},
|
||||
'security': {'label': 'Security', 'description': 'Authentication, firewall, bans'},
|
||||
'cluster': {'label': 'Cluster', 'description': 'Cluster health and quorum'},
|
||||
'vm_ct': {'label': 'VM / CT', 'description': 'Start, stop, crash, migration'},
|
||||
'backup': {'label': 'Backups', 'description': 'Backup start, complete, fail'},
|
||||
'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature'},
|
||||
'storage': {'label': 'Storage', 'description': 'Disk space, I/O, SMART'},
|
||||
'network': {'label': 'Network', 'description': 'Connectivity, bond, latency'},
|
||||
'security': {'label': 'Security', 'description': 'Auth failures, Fail2Ban, firewall'},
|
||||
'cluster': {'label': 'Cluster', 'description': 'Quorum, split-brain, HA fencing'},
|
||||
'services': {'label': 'Services', 'description': 'System services, shutdown, reboot'},
|
||||
'health': {'label': 'Health Monitor', 'description': 'Health checks, degradation, recovery'},
|
||||
'updates': {'label': 'Updates', 'description': 'System and PVE updates'},
|
||||
'other': {'label': 'Other', 'description': 'Uncategorized notifications'},
|
||||
}
|
||||
|
||||
|
||||
@@ -777,14 +867,16 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
|
||||
template = TEMPLATES.get(event_type)
|
||||
if not template:
|
||||
# Catch-all: unknown event types always get delivered (group 'other')
|
||||
# so no Proxmox notification is ever silently dropped.
|
||||
fallback_body = data.get('message', data.get('reason', str(data)))
|
||||
severity = data.get('severity', 'INFO')
|
||||
return {
|
||||
'title': f"{_get_hostname()}: {event_type}",
|
||||
'body': fallback_body, 'body_text': fallback_body,
|
||||
'body_html': f'<p>{html_mod.escape(str(fallback_body))}</p>',
|
||||
'fields': [], 'tags': [severity, 'system', event_type],
|
||||
'severity': severity, 'group': 'system',
|
||||
'fields': [], 'tags': [severity, 'other', event_type],
|
||||
'severity': severity, 'group': 'other',
|
||||
}
|
||||
|
||||
# Ensure hostname is always available
|
||||
@@ -883,24 +975,36 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def get_event_types_by_group() -> Dict[str, list]:
|
||||
"""Get all event types organized by group, for UI rendering.
|
||||
|
||||
Hidden templates (burst aggregations, internal types) are excluded
|
||||
from the UI. They still work in the backend and inherit enabled
|
||||
state from their parent event type.
|
||||
|
||||
Returns:
|
||||
{group_key: [{'type': event_type, 'title': template_title,
|
||||
{group_key: [{'type': event_type, 'title': label,
|
||||
'default_enabled': bool}, ...]}
|
||||
"""
|
||||
result = {}
|
||||
for event_type, template in TEMPLATES.items():
|
||||
group = template.get('group', 'system')
|
||||
# Skip hidden templates (bursts, internal, deprecated)
|
||||
if template.get('hidden', False):
|
||||
continue
|
||||
|
||||
group = template.get('group', 'other')
|
||||
if group not in result:
|
||||
result[group] = []
|
||||
import re
|
||||
# Clean title: remove {hostname}: prefix and any remaining {placeholders}
|
||||
title = template['title'].replace('{hostname}', '').strip(': ')
|
||||
title = re.sub(r'\s*\{[^}]+\}', '', title).strip(' -:')
|
||||
if not title:
|
||||
title = event_type.replace('_', ' ').title()
|
||||
|
||||
# Use explicit label if available, otherwise derive from title
|
||||
label = template.get('label', '')
|
||||
if not label:
|
||||
import re
|
||||
label = template['title'].replace('{hostname}', '').strip(': ')
|
||||
label = re.sub(r'\s*\{[^}]+\}', '', label).strip(' -:')
|
||||
if not label:
|
||||
label = event_type.replace('_', ' ').title()
|
||||
|
||||
result[group].append({
|
||||
'type': event_type,
|
||||
'title': title,
|
||||
'title': label,
|
||||
'default_enabled': template.get('default_enabled', True),
|
||||
})
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user