diff --git a/AppImage/components/notification-settings.tsx b/AppImage/components/notification-settings.tsx index 6d720da3..99e6f4d2 100644 --- a/AppImage/components/notification-settings.tsx +++ b/AppImage/components/notification-settings.tsx @@ -40,13 +40,18 @@ interface EventTypeInfo { default_enabled: boolean } +interface ChannelOverrides { + categories: Record + events: Record +} + interface NotificationConfig { enabled: boolean channels: Record - severity_filter: string event_categories: Record event_toggles: Record event_types_by_group: Record + channel_overrides: Record ai_enabled: boolean ai_provider: string ai_api_key: string @@ -79,23 +84,22 @@ interface HistoryEntry { error_message: string | null } -const SEVERITY_OPTIONS = [ - { value: "critical", label: "Critical only" }, - { value: "warning", label: "Warning + Critical" }, - { value: "info", label: "All (Info + Warning + Critical)" }, -] - const EVENT_CATEGORIES = [ - { key: "system", label: "System", desc: "Startup, shutdown, kernel events" }, { key: "vm_ct", label: "VM / CT", desc: "Start, stop, crash, migration" }, { key: "backup", label: "Backups", desc: "Backup start, complete, fail" }, { key: "resources", label: "Resources", desc: "CPU, memory, temperature" }, - { key: "storage", label: "Storage", desc: "Disk space, I/O errors, SMART" }, + { key: "storage", label: "Storage", desc: "Disk space, I/O, SMART" }, { key: "network", label: "Network", desc: "Connectivity, bond, latency" }, - { key: "security", label: "Security", desc: "Auth failures, fail2ban, firewall" }, + { key: "security", label: "Security", desc: "Auth failures, Fail2Ban, firewall" }, { key: "cluster", label: "Cluster", desc: "Quorum, split-brain, HA fencing" }, + { key: "services", label: "Services", desc: "System services, shutdown, reboot" }, + { key: "health", label: "Health Monitor", desc: "Health checks, degradation, recovery" }, + { key: "updates", label: "Updates", desc: "System and PVE updates" }, + { key: "other", label: "Other", desc: "Uncategorized notifications" }, ] +const CHANNEL_TYPES = ["telegram", "gotify", "discord", "email"] as const + const AI_PROVIDERS = [ { value: "openai", label: "OpenAI" }, { value: "groq", label: "Groq" }, @@ -109,13 +113,19 @@ const DEFAULT_CONFIG: NotificationConfig = { discord: { enabled: false }, email: { enabled: false }, }, - severity_filter: "all", event_categories: { - system: true, vm_ct: true, backup: true, resources: true, - storage: true, network: true, security: true, cluster: true, + vm_ct: true, backup: true, resources: true, storage: true, + network: true, security: true, cluster: true, services: true, + health: true, updates: true, other: true, }, event_toggles: {}, event_types_by_group: {}, + channel_overrides: { + telegram: { categories: {}, events: {} }, + gotify: { categories: {}, events: {} }, + discord: { categories: {}, events: {} }, + email: { categories: {}, events: {} }, + }, ai_enabled: false, ai_provider: "openai", ai_api_key: "", @@ -217,7 +227,6 @@ export function NotificationSettings() { const flattenConfig = (cfg: NotificationConfig): Record => { const flat: Record = { enabled: String(cfg.enabled), - severity_filter: cfg.severity_filter, ai_enabled: String(cfg.ai_enabled), ai_provider: cfg.ai_provider, ai_api_key: cfg.ai_api_key, @@ -235,20 +244,17 @@ export function NotificationSettings() { flat[`${chName}.${field}`] = String(value ?? "") } } - // Flatten event_categories: { system: true, backups: false } -> events.system, events.backups + // Flatten global event_categories: { vm_ct: true, backup: false } -> events.vm_ct, events.backup for (const [cat, enabled] of Object.entries(cfg.event_categories)) { flat[`events.${cat}`] = String(enabled) } - // Flatten event_toggles: { vm_start: true, vm_stop: false } -> event.vm_start, event.vm_stop - // Always write ALL toggles to DB so the backend has an explicit record. - // This ensures default_enabled changes in templates don't get overridden by stale DB values. + // Flatten global event_toggles: { vm_start: true } -> event.vm_start if (cfg.event_toggles) { for (const [evt, enabled] of Object.entries(cfg.event_toggles)) { flat[`event.${evt}`] = String(enabled) } } - // Also write any events NOT in event_toggles using their template defaults. - // This covers newly added templates whose default_enabled may be false. + // Write defaults for events NOT in toggles if (cfg.event_types_by_group) { for (const events of Object.values(cfg.event_types_by_group)) { for (const evt of (events as Array<{type: string, default_enabled: boolean}>)) { @@ -259,6 +265,21 @@ export function NotificationSettings() { } } } + // Flatten per-channel overrides: telegram.events.backup, telegram.event.vm_start, etc. + if (cfg.channel_overrides) { + for (const [chName, overrides] of Object.entries(cfg.channel_overrides)) { + if (overrides.categories) { + for (const [cat, enabled] of Object.entries(overrides.categories)) { + flat[`${chName}.events.${cat}`] = String(enabled) + } + } + if (overrides.events) { + for (const [evt, enabled] of Object.entries(overrides.events)) { + flat[`${chName}.event.${evt}`] = String(enabled) + } + } + } + } return flat } @@ -1052,27 +1073,8 @@ matcher: proxmenux-pbs Filters & Events
- {/* Severity */} + {/* Event Categories (global defaults -- per-channel overrides in Channel Filters below) */}
- - -
- - {/* Event Categories */} -
{EVENT_CATEGORIES.map(cat => { @@ -1198,6 +1200,118 @@ matcher: proxmenux-pbs })}
+ + {/* Per-channel overrides */} +
+ +

+ By default every channel inherits the global settings above. Override specific categories per channel to customize what each destination receives. +

+
+ {CHANNEL_TYPES.map(chName => { + const chEnabled = config.channels[chName]?.enabled + if (!chEnabled) return null + const overrides = config.channel_overrides?.[chName] || { categories: {}, events: {} } + const hasOverrides = Object.keys(overrides.categories).length > 0 + const chLabel = chName === "email" ? "Email" : chName.charAt(0).toUpperCase() + chName.slice(1) + const chColor = chName === "telegram" ? "blue" : chName === "gotify" ? "green" : chName === "discord" ? "indigo" : "amber" + + return ( +
+ +
+ + {chLabel} + {hasOverrides && ( + + customized + + )} +
+ {!hasOverrides && ( + inherits global + )} +
+
+ {EVENT_CATEGORIES.map(cat => { + const globalEnabled = config.event_categories[cat.key] ?? true + const override = overrides.categories[cat.key] + const isCustomized = override !== undefined + const effectiveEnabled = isCustomized ? override : globalEnabled + + return ( +
+
+ + {cat.label} + + {!isCustomized && ( + global + )} +
+
+ {isCustomized && ( + + )} + +
+
+ ) + })} +
+
+ ) + })} + {CHANNEL_TYPES.every(ch => !config.channels[ch]?.enabled) && ( +

+ Enable at least one channel above to configure per-channel filters. +

+ )} +
+
+
{/* close bordered filters container */} diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 747db50d..3bab0fd1 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -575,13 +575,31 @@ def _temperature_collector_loop(): def _health_collector_loop(): """Background thread: run full health checks every 5 minutes. - Keeps the health cache always fresh and records events/errors in the DB - so the future notification service can consume them.""" + Keeps the health cache always fresh and records events/errors in the DB. + Also emits notifications when a health category degrades (OK -> WARNING/CRITICAL).""" from health_monitor import health_monitor # Wait 30s after startup to let other services initialize time.sleep(30) + # Track previous status per category to detect transitions + _prev_statuses = {} + # Severity ranking for comparison + _SEV_RANK = {'OK': 0, 'INFO': 0, 'UNKNOWN': 1, 'WARNING': 2, 'CRITICAL': 3} + # Human-readable category names + _CAT_NAMES = { + 'cpu': 'CPU Usage & Temperature', + 'memory': 'Memory & Swap', + 'storage': 'Storage Mounts & Space', + 'disks': 'Disk I/O & Errors', + 'network': 'Network Interfaces', + 'vms': 'VMs & Containers', + 'services': 'PVE Services', + 'logs': 'System Logs', + 'updates': 'System Updates', + 'security': 'Security', + } + while True: try: # Run full health check (results get cached internally + recorded in DB) @@ -598,6 +616,64 @@ def _health_collector_loop(): health_monitor.cached_results['_bg_detailed'] = result health_monitor.last_check_times['_bg_overall'] = time.time() health_monitor.last_check_times['_bg_detailed'] = time.time() + + # ── Health degradation notifications ── + # Compare each category's current status to previous cycle. + # Notify when a category DEGRADES (OK->WARNING, WARNING->CRITICAL, etc.) + # Include the detailed 'reason' so the user knows exactly what triggered it. + details = result.get('details', {}) + degraded = [] + + for cat_key, cat_data in details.items(): + cur_status = cat_data.get('status', 'OK') + prev_status = _prev_statuses.get(cat_key, 'OK') + cur_rank = _SEV_RANK.get(cur_status, 0) + prev_rank = _SEV_RANK.get(prev_status, 0) + + if cur_rank > prev_rank and cur_rank >= 2: # WARNING or CRITICAL + reason = cat_data.get('reason', f'{cat_key} status changed to {cur_status}') + cat_name = _CAT_NAMES.get(cat_key, cat_key) + degraded.append({ + 'category': cat_name, + 'status': cur_status, + 'reason': reason, + }) + + _prev_statuses[cat_key] = cur_status + + # Send grouped notification if any categories degraded + if degraded and notification_manager._enabled: + hostname = result.get('hostname', '') + if not hostname: + import socket as _sock + hostname = _sock.gethostname() + + if len(degraded) == 1: + d = degraded[0] + title = f"{hostname}: Health {d['status']} - {d['category']}" + body = d['reason'] + severity = d['status'] + else: + # Multiple categories degraded at once -- group them + max_sev = max(degraded, key=lambda x: _SEV_RANK.get(x['status'], 0))['status'] + title = f"{hostname}: {len(degraded)} health checks degraded" + lines = [] + for d in degraded: + lines.append(f" [{d['status']}] {d['category']}: {d['reason']}") + body = '\n'.join(lines) + severity = max_sev + + try: + notification_manager.send_notification( + event_type='health_degraded', + severity=severity, + title=title, + message=body, + data={'hostname': hostname, 'count': str(len(degraded))}, + source='health_monitor', + ) + except Exception as e: + print(f"[ProxMenux] Health notification error: {e}") except Exception as e: print(f"[ProxMenux] Health collector error: {e}") diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index cbb31b23..5bc5de4a 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -2778,24 +2778,41 @@ class HealthMonitor: return 'INFO' return severity + # Build detail strings that include the actual error samples + # so the user can see exactly WHAT is triggering the warning. + if cascade_count > 0: + cascade_detail = f'{cascade_count} pattern(s) repeating >=15 times: ' + '; '.join(cascade_samples) + else: + cascade_detail = 'No cascading errors' + + if spike_count > 0: + spike_detail = f'{spike_count} pattern(s) with 4x increase: ' + '; '.join(spike_samples) + else: + spike_detail = 'No error spikes' + + if persistent_count > 0: + persist_detail = f'{persistent_count} recurring pattern(s) over 15+ min: ' + '; '.join(persist_samples) + else: + persist_detail = 'No persistent patterns' + log_checks = { 'log_error_cascade': { 'status': _log_check_status('log_error_cascade', cascade_count > 0, 'WARNING'), - 'detail': f'{cascade_count} pattern(s) repeating >=15 times' if cascade_count > 0 else 'No cascading errors', + 'detail': cascade_detail, 'dismissable': True, 'dismissed': 'log_error_cascade' in dismissed_keys, 'error_key': 'log_error_cascade' }, 'log_error_spike': { 'status': _log_check_status('log_error_spike', spike_count > 0, 'WARNING'), - 'detail': f'{spike_count} pattern(s) with 4x increase' if spike_count > 0 else 'No error spikes', + 'detail': spike_detail, 'dismissable': True, 'dismissed': 'log_error_spike' in dismissed_keys, 'error_key': 'log_error_spike' }, 'log_persistent_errors': { 'status': _log_check_status('log_persistent_errors', persistent_count > 0, 'WARNING'), - 'detail': f'{persistent_count} recurring pattern(s) over 15+ min' if persistent_count > 0 else 'No persistent patterns', + 'detail': persist_detail, 'dismissable': True, 'dismissed': 'log_persistent_errors' in dismissed_keys, 'error_key': 'log_persistent_errors' diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py index 376a9080..ee9ec3b2 100644 --- a/AppImage/scripts/notification_manager.py +++ b/AppImage/scripts/notification_manager.py @@ -69,9 +69,15 @@ GROUP_RATE_LIMITS = { 'resources': {'max_per_minute': 3, 'max_per_hour': 20}, 'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60}, 'backup': {'max_per_minute': 5, 'max_per_hour': 30}, - 'system': {'max_per_minute': 5, 'max_per_hour': 30}, + 'services': {'max_per_minute': 5, 'max_per_hour': 30}, + 'health': {'max_per_minute': 3, 'max_per_hour': 20}, + 'updates': {'max_per_minute': 3, 'max_per_hour': 15}, + 'other': {'max_per_minute': 5, 'max_per_hour': 30}, } +# Default fallback for unknown groups +_DEFAULT_RATE_LIMIT = {'max_per_minute': 5, 'max_per_hour': 30} + class GroupRateLimiter: """Rate limiter per event group. Prevents notification storms.""" @@ -84,7 +90,7 @@ class GroupRateLimiter: def allow(self, group: str) -> bool: """Check if group rate limit allows this event.""" - limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system']) + limits = GROUP_RATE_LIMITS.get(group, _DEFAULT_RATE_LIMIT) now = time.time() # Initialize if needed @@ -554,35 +560,28 @@ class NotificationManager: print(f"[NotificationManager] Aggregation flush error: {e}") def _process_event(self, event: NotificationEvent): - """Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch.""" + """Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch. + + NOTE: Group and per-event filters are checked globally here. + Per-channel overrides are applied later in _dispatch_to_channels(). + """ if not self._enabled: return - # Check if this event's GROUP is enabled in settings. - # The UI saves categories by group key: events.vm_ct, events.backup, etc. + # Check if this event's GROUP is enabled globally. template = TEMPLATES.get(event.event_type, {}) - event_group = template.get('group', 'system') + event_group = template.get('group', 'other') group_setting = f'events.{event_group}' if self._config.get(group_setting, 'true') == 'false': return - # Check if this SPECIFIC event type is enabled (granular per-event toggle). - # Key format: event.{event_type} = "true"/"false" + # Check if this SPECIFIC event type is enabled globally. # Default comes from the template's default_enabled field. default_enabled = 'true' if template.get('default_enabled', True) else 'false' event_specific = f'event.{event.event_type}' if self._config.get(event_specific, default_enabled) == 'false': return - # Check severity filter. - # The UI saves severity_filter as: "all", "warning", "critical". - # Map to our internal severity names for comparison. - severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} - raw_filter = self._config.get('severity_filter', 'all') - min_severity = severity_map.get(raw_filter.lower(), 'INFO') - if not self._meets_severity(event.severity, min_severity): - return - # Try aggregation (may buffer the event) result = self._aggregator.ingest(event) if result is None: @@ -593,30 +592,23 @@ class NotificationManager: self._dispatch_event(event) def _process_event_direct(self, event: NotificationEvent): - """Process a burst summary event. Bypasses aggregator but applies ALL other filters.""" + """Process a burst summary event. Bypasses aggregator but applies global filters.""" if not self._enabled: return - # Check group filter (same as _process_event) + # Check group filter template = TEMPLATES.get(event.event_type, {}) - event_group = template.get('group', 'system') + event_group = template.get('group', 'other') group_setting = f'events.{event_group}' if self._config.get(group_setting, 'true') == 'false': return - # Check per-event filter (same as _process_event) + # Check per-event filter default_enabled = 'true' if template.get('default_enabled', True) else 'false' event_specific = f'event.{event.event_type}' if self._config.get(event_specific, default_enabled) == 'false': return - # Check severity filter (same mapping as _process_event) - severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} - raw_filter = self._config.get('severity_filter', 'all') - min_severity = severity_map.get(raw_filter.lower(), 'INFO') - if not self._meets_severity(event.severity, min_severity): - return - self._dispatch_event(event) def _dispatch_event(self, event: NotificationEvent): @@ -636,7 +628,7 @@ class NotificationManager: # Check group rate limit template = TEMPLATES.get(event.event_type, {}) - group = template.get('group', 'system') + group = template.get('group', 'other') if not self._group_limiter.allow(group): return @@ -674,11 +666,33 @@ class NotificationManager: def _dispatch_to_channels(self, title: str, body: str, severity: str, event_type: str, data: Dict, source: str): - """Send notification through all configured channels.""" + """Send notification through configured channels, respecting per-channel overrides. + + Each channel can override global category/event settings: + - {channel}.events.{group} = "true"/"false" (category override) + - {channel}.event.{type} = "true"/"false" (per-event override) + If no override exists, the channel inherits the global setting (already checked). + """ with self._lock: channels = dict(self._channels) + template = TEMPLATES.get(event_type, {}) + event_group = template.get('group', 'other') + for ch_name, channel in channels.items(): + # ── Per-channel override check ── + # If the channel has an explicit override for this group or event, respect it. + # If no override, the global filter already passed (checked in _process_event). + ch_group_key = f'{ch_name}.events.{event_group}' + ch_group_override = self._config.get(ch_group_key) + if ch_group_override == 'false': + continue # Channel explicitly disabled this category + + ch_event_key = f'{ch_name}.event.{event_type}' + ch_event_override = self._config.get(ch_event_key) + if ch_event_override == 'false': + continue # Channel explicitly disabled this event + try: result = channel.send(title, body, severity, data) self._record_history( @@ -857,12 +871,6 @@ class NotificationManager: except Exception: pass - @staticmethod - def _meets_severity(event_severity: str, min_severity: str) -> bool: - """Check if event severity meets the minimum threshold.""" - levels = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2} - return levels.get(event_severity, 0) >= levels.get(min_severity, 0) - # ─── History Recording ────────────────────────────────────── def _record_history(self, event_type: str, channel: str, title: str, @@ -1171,7 +1179,7 @@ class NotificationManager: channels[ch_type] = ch_cfg # Build event_categories dict (group-level toggle) - # EVENT_GROUPS is a dict: { 'system': {...}, 'vm_ct': {...}, ... } + # EVENT_GROUPS is a dict: { 'vm_ct': {...}, 'services': {...}, 'health': {...}, ... } event_categories = {} for group_key in EVENT_GROUPS: event_categories[group_key] = self._config.get(f'events.{group_key}', 'true') == 'true' @@ -1189,13 +1197,28 @@ class NotificationManager: # Build event_types_by_group for UI rendering event_types_by_group = get_event_types_by_group() + # Build per-channel overrides + # Keys: {channel}.events.{group} and {channel}.event.{event_type} + channel_overrides = {} + for ch_type in CHANNEL_TYPES: + ch_overrides = {'categories': {}, 'events': {}} + for group_key in EVENT_GROUPS: + val = self._config.get(f'{ch_type}.events.{group_key}') + if val is not None: + ch_overrides['categories'][group_key] = val == 'true' + for event_type_key in TEMPLATES: + val = self._config.get(f'{ch_type}.event.{event_type_key}') + if val is not None: + ch_overrides['events'][event_type_key] = val == 'true' + channel_overrides[ch_type] = ch_overrides + config = { 'enabled': self._enabled, 'channels': channels, - 'severity_filter': self._config.get('severity_filter', 'all'), 'event_categories': event_categories, 'event_toggles': event_toggles, 'event_types_by_group': event_types_by_group, + 'channel_overrides': channel_overrides, 'ai_enabled': self._config.get('ai_enabled', 'false') == 'true', 'ai_provider': self._config.get('ai_provider', 'openai'), 'ai_api_key': self._config.get('ai_api_key', ''), diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index bdd5c682..9711739b 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -342,25 +342,36 @@ TEMPLATES = { 'state_change': { 'title': '{hostname}: {category} changed to {current}', 'body': '{category} status changed from {previous} to {current}.\n{reason}', - 'group': 'system', + 'label': 'Health state changed', + 'group': 'health', 'default_enabled': False, }, 'new_error': { 'title': '{hostname}: New {severity} - {category}', 'body': '{reason}', - 'group': 'system', + 'label': 'New health issue', + 'group': 'health', 'default_enabled': True, }, 'error_resolved': { 'title': '{hostname}: Resolved - {category}', 'body': '{reason}\nDuration: {duration}', - 'group': 'system', + 'label': 'Health issue resolved', + 'group': 'health', 'default_enabled': True, }, 'error_escalated': { 'title': '{hostname}: Escalated to {severity} - {category}', 'body': '{reason}', - 'group': 'system', + 'label': 'Health issue escalated', + 'group': 'health', + 'default_enabled': True, + }, + 'health_degraded': { + 'title': '{hostname}: Health check degraded', + 'body': '{reason}', + 'label': 'Health check degraded', + 'group': 'health', 'default_enabled': True, }, @@ -368,90 +379,105 @@ TEMPLATES = { 'vm_start': { 'title': '{hostname}: VM {vmid} started', 'body': '{vmname} ({vmid}) has been started.', + 'label': 'VM started', 'group': 'vm_ct', 'default_enabled': True, }, 'vm_stop': { 'title': '{hostname}: VM {vmid} stopped', 'body': '{vmname} ({vmid}) has been stopped.', + 'label': 'VM stopped', 'group': 'vm_ct', 'default_enabled': False, }, 'vm_shutdown': { 'title': '{hostname}: VM {vmid} shutdown', 'body': '{vmname} ({vmid}) has been shut down.', + 'label': 'VM shutdown', 'group': 'vm_ct', 'default_enabled': False, }, 'vm_fail': { 'title': '{hostname}: VM {vmid} FAILED', 'body': '{vmname} ({vmid}) has failed.\n{reason}', + 'label': 'VM FAILED', 'group': 'vm_ct', 'default_enabled': True, }, 'vm_restart': { 'title': '{hostname}: VM {vmid} restarted', 'body': '{vmname} ({vmid}) has been restarted.', + 'label': 'VM restarted', 'group': 'vm_ct', 'default_enabled': False, }, 'ct_start': { 'title': '{hostname}: CT {vmid} started', 'body': '{vmname} ({vmid}) has been started.', + 'label': 'CT started', 'group': 'vm_ct', 'default_enabled': True, }, 'ct_stop': { 'title': '{hostname}: CT {vmid} stopped', 'body': '{vmname} ({vmid}) has been stopped.', + 'label': 'CT stopped', 'group': 'vm_ct', 'default_enabled': False, }, 'ct_shutdown': { 'title': '{hostname}: CT {vmid} shutdown', 'body': '{vmname} ({vmid}) has been shut down.', + 'label': 'CT shutdown', 'group': 'vm_ct', 'default_enabled': False, }, 'ct_restart': { 'title': '{hostname}: CT {vmid} restarted', 'body': '{vmname} ({vmid}) has been restarted.', + 'label': 'CT restarted', 'group': 'vm_ct', 'default_enabled': False, }, 'ct_fail': { 'title': '{hostname}: CT {vmid} FAILED', 'body': '{vmname} ({vmid}) has failed.\n{reason}', + 'label': 'CT FAILED', 'group': 'vm_ct', 'default_enabled': True, }, 'migration_start': { 'title': '{hostname}: Migration started - {vmid}', 'body': '{vmname} ({vmid}) migration to {target_node} started.', + 'label': 'Migration started', 'group': 'vm_ct', 'default_enabled': True, }, 'migration_complete': { 'title': '{hostname}: Migration complete - {vmid}', 'body': '{vmname} ({vmid}) migrated successfully to {target_node}.', + 'label': 'Migration complete', 'group': 'vm_ct', 'default_enabled': True, }, 'migration_fail': { 'title': '{hostname}: Migration FAILED - {vmid}', 'body': '{vmname} ({vmid}) migration to {target_node} failed.\n{reason}', + 'label': 'Migration FAILED', 'group': 'vm_ct', 'default_enabled': True, }, 'replication_fail': { 'title': '{hostname}: Replication FAILED - {vmid}', 'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}', + 'label': 'Replication FAILED', 'group': 'vm_ct', 'default_enabled': True, }, 'replication_complete': { 'title': '{hostname}: Replication complete - {vmid}', 'body': 'Replication of {vmname} ({vmid}) completed successfully.', + 'label': 'Replication complete', 'group': 'vm_ct', 'default_enabled': False, }, @@ -460,30 +486,35 @@ TEMPLATES = { 'backup_start': { 'title': '{hostname}: Backup started', 'body': '{reason}', + 'label': 'Backup started', 'group': 'backup', 'default_enabled': False, }, 'backup_complete': { 'title': '{hostname}: Backup complete - {vmid}', 'body': 'Backup of {vmname} ({vmid}) completed successfully.\nSize: {size}', + 'label': 'Backup complete', 'group': 'backup', 'default_enabled': True, }, 'backup_fail': { 'title': '{hostname}: Backup FAILED - {vmid}', 'body': 'Backup of {vmname} ({vmid}) has failed.\n{reason}', + 'label': 'Backup FAILED', 'group': 'backup', 'default_enabled': True, }, 'snapshot_complete': { 'title': '{hostname}: Snapshot created - {vmid}', 'body': 'Snapshot of {vmname} ({vmid}) created: {snapshot_name}', + 'label': 'Snapshot created', 'group': 'backup', 'default_enabled': False, }, 'snapshot_fail': { 'title': '{hostname}: Snapshot FAILED - {vmid}', 'body': 'Snapshot of {vmname} ({vmid}) failed.\n{reason}', + 'label': 'Snapshot FAILED', 'group': 'backup', 'default_enabled': True, }, @@ -492,42 +523,49 @@ TEMPLATES = { 'cpu_high': { 'title': '{hostname}: High CPU usage ({value}%)', 'body': 'CPU usage is at {value}% on {cores} cores.\n{details}', + 'label': 'High CPU usage', 'group': 'resources', 'default_enabled': True, }, 'ram_high': { 'title': '{hostname}: High memory usage ({value}%)', 'body': 'Memory usage: {used} / {total} ({value}%).\n{details}', + 'label': 'High memory usage', 'group': 'resources', 'default_enabled': True, }, 'temp_high': { 'title': '{hostname}: High temperature ({value}C)', 'body': 'CPU temperature: {value}C (threshold: {threshold}C).\n{details}', + 'label': 'High temperature', 'group': 'resources', 'default_enabled': True, }, 'disk_space_low': { 'title': '{hostname}: Low disk space on {mount}', 'body': '{mount}: {used}% used ({available} available).', + 'label': 'Low disk space', 'group': 'storage', 'default_enabled': True, }, 'disk_io_error': { 'title': '{hostname}: Disk failure detected on {device}', 'body': '{reason}', + 'label': 'Disk failure / I/O error', 'group': 'storage', 'default_enabled': True, }, 'storage_unavailable': { 'title': '{hostname}: Storage unavailable - {storage_name}', 'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}', + 'label': 'Storage unavailable', 'group': 'storage', 'default_enabled': True, }, 'load_high': { 'title': '{hostname}: High system load ({value})', 'body': 'System load average: {value} on {cores} cores.\n{details}', + 'label': 'High system load', 'group': 'resources', 'default_enabled': True, }, @@ -536,12 +574,14 @@ TEMPLATES = { 'network_down': { 'title': '{hostname}: Network connectivity lost', 'body': 'Network connectivity check failed.\n{reason}', + 'label': 'Network connectivity lost', 'group': 'network', 'default_enabled': True, }, 'network_latency': { 'title': '{hostname}: High network latency ({value}ms)', 'body': 'Latency to gateway: {value}ms (threshold: {threshold}ms).', + 'label': 'High network latency', 'group': 'network', 'default_enabled': False, }, @@ -550,24 +590,28 @@ TEMPLATES = { 'auth_fail': { 'title': '{hostname}: Authentication failure', 'body': 'Failed login attempt from {source_ip}.\nUser: {username}\nService: {service}', + 'label': 'Authentication failure', 'group': 'security', 'default_enabled': True, }, 'ip_block': { 'title': '{hostname}: IP blocked by Fail2Ban', 'body': 'IP {source_ip} has been banned.\nJail: {jail}\nFailures: {failures}', + 'label': 'IP blocked by Fail2Ban', 'group': 'security', 'default_enabled': True, }, 'firewall_issue': { 'title': '{hostname}: Firewall issue detected', 'body': '{reason}', + 'label': 'Firewall issue detected', 'group': 'security', 'default_enabled': True, }, 'user_permission_change': { 'title': '{hostname}: User permission changed', 'body': 'User: {username}\nChange: {change_details}', + 'label': 'User permission changed', 'group': 'security', 'default_enabled': True, }, @@ -576,101 +620,128 @@ TEMPLATES = { 'split_brain': { 'title': '{hostname}: SPLIT-BRAIN detected', 'body': 'Cluster split-brain condition detected.\nQuorum status: {quorum}', + 'label': 'SPLIT-BRAIN detected', 'group': 'cluster', 'default_enabled': True, }, 'node_disconnect': { 'title': '{hostname}: Node disconnected', 'body': 'Node {node_name} has disconnected from the cluster.', + 'label': 'Node disconnected', 'group': 'cluster', 'default_enabled': True, }, 'node_reconnect': { 'title': '{hostname}: Node reconnected', 'body': 'Node {node_name} has reconnected to the cluster.', + 'label': 'Node reconnected', 'group': 'cluster', 'default_enabled': True, }, - # ── System events ── + # ── Services events ── 'system_shutdown': { 'title': '{hostname}: System shutting down', 'body': '{reason}', - 'group': 'system', + 'label': 'System shutting down', + 'group': 'services', 'default_enabled': True, }, 'system_reboot': { 'title': '{hostname}: System rebooting', 'body': '{reason}', - 'group': 'system', + 'label': 'System rebooting', + 'group': 'services', 'default_enabled': True, }, 'system_problem': { 'title': '{hostname}: System problem detected', 'body': '{reason}', - 'group': 'system', + 'label': 'System problem detected', + 'group': 'services', 'default_enabled': True, }, 'service_fail': { 'title': '{hostname}: Service failed - {service_name}', 'body': '{reason}', - 'group': 'system', + 'label': 'Service failed', + 'group': 'services', 'default_enabled': True, }, + 'oom_kill': { + 'title': '{hostname}: OOM Kill - {process}', + 'body': '{reason}', + 'label': 'Out of memory kill', + 'group': 'services', + 'default_enabled': True, + }, + + # ── Hidden internal templates (not shown in UI) ── 'service_fail_batch': { 'title': '{hostname}: {service_count} services failed', 'body': '{reason}', - 'group': 'system', + 'label': 'Service fail batch', + 'group': 'services', 'default_enabled': True, + 'hidden': True, }, 'system_mail': { 'title': '{hostname}: {pve_title}', 'body': '{reason}', - 'group': 'system', + 'label': 'PVE system mail', + 'group': 'other', 'default_enabled': True, + 'hidden': True, + }, + 'webhook_test': { + 'title': '{hostname}: Webhook test received', + 'body': 'PVE webhook connectivity test successful.\n{reason}', + 'label': 'Webhook test', + 'group': 'other', + 'default_enabled': True, + 'hidden': True, }, 'update_available': { 'title': '{hostname}: Updates available', 'body': 'Total updates: {total_count}\nSecurity: {security_count}\nProxmox: {pve_count}\nKernel: {kernel_count}\nImportant: {important_list}', - 'group': 'system', - 'default_enabled': False, # Superseded by update_summary - }, - 'update_complete': { - 'title': '{hostname}: Update completed', - 'body': '{details}', - 'group': 'system', + 'label': 'Updates available (legacy)', + 'group': 'updates', 'default_enabled': False, + 'hidden': True, }, - - # ── Unknown persistent (from health monitor) ── 'unknown_persistent': { 'title': '{hostname}: Check unavailable - {category}', 'body': 'Health check for {category} has been unavailable for 3+ cycles.\n{reason}', - 'group': 'system', + 'label': 'Check unavailable', + 'group': 'health', 'default_enabled': False, + 'hidden': True, }, - # ── Persistent Health Issues (daily digest) ── + # ── Health Monitor events ── 'health_persistent': { 'title': '{hostname}: {count} active health issue(s)', 'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.', - 'group': 'system', + 'label': 'Active health issues (daily)', + 'group': 'health', 'default_enabled': True, }, 'health_issue_new': { 'title': '{hostname}: New health issue - {category}', 'body': 'New {severity} issue detected:\n{reason}', - 'group': 'system', + 'label': 'New health issue', + 'group': 'health', 'default_enabled': True, }, 'health_issue_resolved': { 'title': '{hostname}: Resolved - {category}', 'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}', - 'group': 'system', + 'label': 'Health issue resolved', + 'group': 'health', 'default_enabled': True, }, - # ── Update notifications (enriched) ── + # ── Update notifications ── 'update_summary': { 'title': '{hostname}: Updates available', 'body': ( @@ -680,80 +751,99 @@ TEMPLATES = { 'Kernel updates: {kernel_count}\n' 'Important packages: {important_list}' ), - 'group': 'system', + 'label': 'Updates available', + 'group': 'updates', 'default_enabled': True, }, 'pve_update': { 'title': '{hostname}: Proxmox VE {new_version} available', 'body': 'Proxmox VE {current_version} -> {new_version}\n{details}', - 'group': 'system', + 'label': 'Proxmox VE update available', + 'group': 'updates', 'default_enabled': True, }, - - # ── PVE webhook test ── - 'webhook_test': { - 'title': '{hostname}: Webhook test received', - 'body': 'PVE webhook connectivity test successful.\n{reason}', - 'group': 'system', - 'default_enabled': True, + 'update_complete': { + 'title': '{hostname}: Update completed', + 'body': '{details}', + 'label': 'Update completed', + 'group': 'updates', + 'default_enabled': False, }, - # ── Burst aggregation summaries ── + # ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ── + # These inherit enabled state from their parent event type at dispatch time. 'burst_auth_fail': { 'title': '{hostname}: {count} auth failures in {window}', 'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}', + 'label': 'Auth failures burst', 'group': 'security', 'default_enabled': True, + 'hidden': True, }, 'burst_ip_block': { 'title': '{hostname}: Fail2Ban banned {count} IPs in {window}', 'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}', + 'label': 'IP block burst', 'group': 'security', 'default_enabled': True, + 'hidden': True, }, 'burst_disk_io': { 'title': '{hostname}: {count} disk I/O errors on {entity_list}', 'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}', + 'label': 'Disk I/O burst', 'group': 'storage', 'default_enabled': True, + 'hidden': True, }, 'burst_cluster': { 'title': '{hostname}: Cluster flapping detected ({count} changes)', 'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}', + 'label': 'Cluster flapping burst', 'group': 'cluster', 'default_enabled': True, + 'hidden': True, }, 'burst_service_fail': { 'title': '{hostname}: {count} services failed in {window}', 'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}', - 'group': 'system', + 'label': 'Service fail burst', + 'group': 'services', 'default_enabled': True, + 'hidden': True, }, 'burst_system': { 'title': '{hostname}: {count} system problems in {window}', 'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}', - 'group': 'system', + 'label': 'System problems burst', + 'group': 'services', 'default_enabled': True, + 'hidden': True, }, 'burst_generic': { 'title': '{hostname}: {count} {event_type} events in {window}', 'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}', - 'group': 'system', + 'label': 'Generic burst', + 'group': 'other', 'default_enabled': True, + 'hidden': True, }, } # ─── Event Groups (for UI filtering) ───────────────────────────── EVENT_GROUPS = { - 'system': {'label': 'System', 'description': 'System health, services, updates'}, - 'vm_ct': {'label': 'VM / CT', 'description': 'Virtual machines and containers'}, - 'backup': {'label': 'Backup', 'description': 'Backups and snapshots'}, - 'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature, load'}, - 'storage': {'label': 'Storage', 'description': 'Disk space and I/O'}, - 'network': {'label': 'Network', 'description': 'Connectivity and latency'}, - 'security': {'label': 'Security', 'description': 'Authentication, firewall, bans'}, - 'cluster': {'label': 'Cluster', 'description': 'Cluster health and quorum'}, + 'vm_ct': {'label': 'VM / CT', 'description': 'Start, stop, crash, migration'}, + 'backup': {'label': 'Backups', 'description': 'Backup start, complete, fail'}, + 'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature'}, + 'storage': {'label': 'Storage', 'description': 'Disk space, I/O, SMART'}, + 'network': {'label': 'Network', 'description': 'Connectivity, bond, latency'}, + 'security': {'label': 'Security', 'description': 'Auth failures, Fail2Ban, firewall'}, + 'cluster': {'label': 'Cluster', 'description': 'Quorum, split-brain, HA fencing'}, + 'services': {'label': 'Services', 'description': 'System services, shutdown, reboot'}, + 'health': {'label': 'Health Monitor', 'description': 'Health checks, degradation, recovery'}, + 'updates': {'label': 'Updates', 'description': 'System and PVE updates'}, + 'other': {'label': 'Other', 'description': 'Uncategorized notifications'}, } @@ -777,14 +867,16 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]: template = TEMPLATES.get(event_type) if not template: + # Catch-all: unknown event types always get delivered (group 'other') + # so no Proxmox notification is ever silently dropped. fallback_body = data.get('message', data.get('reason', str(data))) severity = data.get('severity', 'INFO') return { 'title': f"{_get_hostname()}: {event_type}", 'body': fallback_body, 'body_text': fallback_body, 'body_html': f'

{html_mod.escape(str(fallback_body))}

', - 'fields': [], 'tags': [severity, 'system', event_type], - 'severity': severity, 'group': 'system', + 'fields': [], 'tags': [severity, 'other', event_type], + 'severity': severity, 'group': 'other', } # Ensure hostname is always available @@ -883,24 +975,36 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]: def get_event_types_by_group() -> Dict[str, list]: """Get all event types organized by group, for UI rendering. + Hidden templates (burst aggregations, internal types) are excluded + from the UI. They still work in the backend and inherit enabled + state from their parent event type. + Returns: - {group_key: [{'type': event_type, 'title': template_title, + {group_key: [{'type': event_type, 'title': label, 'default_enabled': bool}, ...]} """ result = {} for event_type, template in TEMPLATES.items(): - group = template.get('group', 'system') + # Skip hidden templates (bursts, internal, deprecated) + if template.get('hidden', False): + continue + + group = template.get('group', 'other') if group not in result: result[group] = [] - import re - # Clean title: remove {hostname}: prefix and any remaining {placeholders} - title = template['title'].replace('{hostname}', '').strip(': ') - title = re.sub(r'\s*\{[^}]+\}', '', title).strip(' -:') - if not title: - title = event_type.replace('_', ' ').title() + + # Use explicit label if available, otherwise derive from title + label = template.get('label', '') + if not label: + import re + label = template['title'].replace('{hostname}', '').strip(': ') + label = re.sub(r'\s*\{[^}]+\}', '', label).strip(' -:') + if not label: + label = event_type.replace('_', ' ').title() + result[group].append({ 'type': event_type, - 'title': title, + 'title': label, 'default_enabled': template.get('default_enabled', True), }) return result