From 3c64ee7af210ea767ffbeb6024799d2c021da3eb Mon Sep 17 00:00:00 2001 From: MacRimi Date: Fri, 27 Feb 2026 23:45:18 +0100 Subject: [PATCH] Update notification service --- AppImage/components/health-status-modal.tsx | 40 +- AppImage/components/notification-settings.tsx | 1511 ----------------- AppImage/components/settings.tsx | 4 - AppImage/components/storage-overview.tsx | 33 - AppImage/scripts/build_appimage.sh | 5 - AppImage/scripts/flask_notification_routes.py | 695 -------- AppImage/scripts/flask_server.py | 81 +- AppImage/scripts/health_monitor.py | 490 ++---- AppImage/scripts/health_persistence.py | 98 +- AppImage/scripts/notification_channels.py | 579 ------- AppImage/scripts/notification_events.py | 1301 -------------- AppImage/scripts/notification_manager.py | 1283 -------------- AppImage/scripts/notification_templates.py | 958 ----------- 13 files changed, 151 insertions(+), 6927 deletions(-) delete mode 100644 AppImage/components/notification-settings.tsx delete mode 100644 AppImage/scripts/flask_notification_routes.py delete mode 100644 AppImage/scripts/notification_channels.py delete mode 100644 AppImage/scripts/notification_events.py delete mode 100644 AppImage/scripts/notification_manager.py delete mode 100644 AppImage/scripts/notification_templates.py diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index 439b946f..b6da6742 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -3,7 +3,6 @@ import type React from "react" import { useState, useEffect, useCallback } from "react" -import { fetchApi, getApiUrl, getAuthToken } from "@/lib/api-config" import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog" import { Badge } from "@/components/ui/badge" import { Button } from "@/components/ui/button" @@ -123,16 +122,10 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu let newOverallStatus = "OK" // Use the new combined endpoint for fewer round-trips - const token = getAuthToken() - const authHeaders: Record = {} - if (token) { - authHeaders["Authorization"] = `Bearer ${token}` - } - - const response = await fetch(getApiUrl("/api/health/full"), { headers: authHeaders }) + const response = await fetch(getApiUrl("/api/health/full")) if (!response.ok) { // Fallback to legacy endpoint - const legacyResponse = await fetch(getApiUrl("/api/health/details"), { headers: authHeaders }) + const legacyResponse = await fetch(getApiUrl("/api/health/details")) if (!legacyResponse.ok) throw new Error("Failed to fetch health details") const data = await legacyResponse.json() setHealthData(data) @@ -295,22 +288,15 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu setDismissingKey(errorKey) try { - const url = getApiUrl("/api/health/acknowledge") - const token = getAuthToken() - const headers: Record = { "Content-Type": "application/json" } - if (token) { - headers["Authorization"] = `Bearer ${token}` - } - - const response = await fetch(url, { + const response = await fetch(getApiUrl("/api/health/acknowledge"), { method: "POST", - headers, + headers: { "Content-Type": "application/json" }, body: JSON.stringify({ error_key: errorKey }), }) if (!response.ok) { - const errorData = await response.json().catch(() => ({})) - throw new Error(errorData.error || `Failed to dismiss error (${response.status})`) + const errorData = await response.json() + throw new Error(errorData.error || "Failed to dismiss error") } await fetchHealthDetails() @@ -422,10 +408,10 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu key={checkKey} className="flex items-center justify-between gap-1.5 sm:gap-2 text-[10px] sm:text-xs py-1.5 px-2 sm:px-3 rounded-md hover:bg-muted/40 transition-colors" > -
- {getStatusIcon(checkData.status, "sm")} +
+ {getStatusIcon(checkData.status, "sm")} {formatCheckLabel(checkKey)} - {checkData.detail} + {checkData.detail} {checkData.dismissed && ( Dismissed @@ -534,8 +520,8 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
{healthData.summary && healthData.summary !== "All systems operational" && ( -
-

{healthData.summary}

+
+

{healthData.summary}

)} @@ -573,7 +559,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu )}
{reason && !isExpanded && ( -

{reason}

+

{reason}

)}
@@ -592,7 +578,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu {isExpanded && (
{reason && ( -

{reason}

+

{reason}

)} {hasChecks ? ( renderChecks(checks, key) diff --git a/AppImage/components/notification-settings.tsx b/AppImage/components/notification-settings.tsx deleted file mode 100644 index 6d720da3..00000000 --- a/AppImage/components/notification-settings.tsx +++ /dev/null @@ -1,1511 +0,0 @@ -"use client" - -import { useState, useEffect, useCallback } from "react" -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "./ui/card" -import { Tabs, TabsList, TabsTrigger, TabsContent } from "./ui/tabs" -import { Input } from "./ui/input" -import { Label } from "./ui/label" -import { Badge } from "./ui/badge" - -import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "./ui/select" -import { fetchApi } from "../lib/api-config" -import { - Bell, BellOff, Send, CheckCircle2, XCircle, Loader2, - AlertTriangle, Info, Settings2, Zap, Eye, EyeOff, - Trash2, ChevronDown, ChevronUp, ChevronRight, TestTube2, Mail, Webhook, - Copy, Server, Shield -} from "lucide-react" - -interface ChannelConfig { - enabled: boolean - bot_token?: string - chat_id?: string - url?: string - token?: string - webhook_url?: string - // Email channel fields - host?: string - port?: string - username?: string - password?: string - tls_mode?: string - from_address?: string - to_addresses?: string - subject_prefix?: string -} - -interface EventTypeInfo { - type: string - title: string - default_enabled: boolean -} - -interface NotificationConfig { - enabled: boolean - channels: Record - severity_filter: string - event_categories: Record - event_toggles: Record - event_types_by_group: Record - ai_enabled: boolean - ai_provider: string - ai_api_key: string - ai_model: string - hostname: string - webhook_secret: string - webhook_allowed_ips: string - pbs_host: string - pve_host: string - pbs_trusted_sources: string -} - -interface ServiceStatus { - enabled: boolean - running: boolean - channels: Record - queue_size: number - last_sent: string | null - total_sent_24h: number -} - -interface HistoryEntry { - id: number - event_type: string - channel: string - title: string - severity: string - sent_at: string - success: boolean - error_message: string | null -} - -const SEVERITY_OPTIONS = [ - { value: "critical", label: "Critical only" }, - { value: "warning", label: "Warning + Critical" }, - { value: "info", label: "All (Info + Warning + Critical)" }, -] - -const EVENT_CATEGORIES = [ - { key: "system", label: "System", desc: "Startup, shutdown, kernel events" }, - { key: "vm_ct", label: "VM / CT", desc: "Start, stop, crash, migration" }, - { key: "backup", label: "Backups", desc: "Backup start, complete, fail" }, - { key: "resources", label: "Resources", desc: "CPU, memory, temperature" }, - { key: "storage", label: "Storage", desc: "Disk space, I/O errors, SMART" }, - { key: "network", label: "Network", desc: "Connectivity, bond, latency" }, - { key: "security", label: "Security", desc: "Auth failures, fail2ban, firewall" }, - { key: "cluster", label: "Cluster", desc: "Quorum, split-brain, HA fencing" }, -] - -const AI_PROVIDERS = [ - { value: "openai", label: "OpenAI" }, - { value: "groq", label: "Groq" }, -] - -const DEFAULT_CONFIG: NotificationConfig = { - enabled: false, - channels: { - telegram: { enabled: false }, - gotify: { enabled: false }, - discord: { enabled: false }, - email: { enabled: false }, - }, - severity_filter: "all", - event_categories: { - system: true, vm_ct: true, backup: true, resources: true, - storage: true, network: true, security: true, cluster: true, - }, - event_toggles: {}, - event_types_by_group: {}, - ai_enabled: false, - ai_provider: "openai", - ai_api_key: "", - ai_model: "", - hostname: "", - webhook_secret: "", - webhook_allowed_ips: "", - pbs_host: "", - pve_host: "", - pbs_trusted_sources: "", -} - -export function NotificationSettings() { - const [config, setConfig] = useState(DEFAULT_CONFIG) - const [status, setStatus] = useState(null) - const [history, setHistory] = useState([]) - const [loading, setLoading] = useState(true) - const [saving, setSaving] = useState(false) - const [saved, setSaved] = useState(false) - const [testing, setTesting] = useState(null) - const [testResult, setTestResult] = useState<{ channel: string; success: boolean; message: string } | null>(null) - const [showHistory, setShowHistory] = useState(false) - const [showAdvanced, setShowAdvanced] = useState(false) - const [showSecrets, setShowSecrets] = useState>({}) - const [editMode, setEditMode] = useState(false) - const [hasChanges, setHasChanges] = useState(false) - const [expandedCategories, setExpandedCategories] = useState>(new Set()) - const [originalConfig, setOriginalConfig] = useState(DEFAULT_CONFIG) - const [webhookSetup, setWebhookSetup] = useState<{ - status: "idle" | "running" | "success" | "failed" - fallback_commands: string[] - error: string - }>({ status: "idle", fallback_commands: [], error: "" }) - - const loadConfig = useCallback(async () => { - try { - const data = await fetchApi<{ success: boolean; config: NotificationConfig }>("/api/notifications/settings") - if (data.success && data.config) { - setConfig(data.config) - setOriginalConfig(data.config) - } - } catch (err) { - console.error("Failed to load notification settings:", err) - } finally { - setLoading(false) - } - }, []) - - const loadStatus = useCallback(async () => { - try { - const data = await fetchApi<{ success: boolean } & ServiceStatus>("/api/notifications/status") - if (data.success) { - setStatus(data) - } - } catch { - // Service may not be running yet - } - }, []) - - const loadHistory = useCallback(async () => { - try { - const data = await fetchApi<{ success: boolean; history: HistoryEntry[]; total: number }>("/api/notifications/history?limit=20") - if (data.success) { - setHistory(data.history || []) - } - } catch { - // Ignore - } - }, []) - - useEffect(() => { - loadConfig() - loadStatus() - }, [loadConfig, loadStatus]) - - useEffect(() => { - if (showHistory) loadHistory() - }, [showHistory, loadHistory]) - - const updateConfig = (updater: (prev: NotificationConfig) => NotificationConfig) => { - setConfig(prev => { - const next = updater(prev) - setHasChanges(true) - return next - }) - } - - const updateChannel = (channel: string, field: string, value: string | boolean) => { - updateConfig(prev => ({ - ...prev, - channels: { - ...prev.channels, - [channel]: { ...prev.channels[channel], [field]: value }, - }, - })) - } - - /** Flatten the nested NotificationConfig into the flat key-value map the backend expects. */ - const flattenConfig = (cfg: NotificationConfig): Record => { - const flat: Record = { - enabled: String(cfg.enabled), - severity_filter: cfg.severity_filter, - ai_enabled: String(cfg.ai_enabled), - ai_provider: cfg.ai_provider, - ai_api_key: cfg.ai_api_key, - ai_model: cfg.ai_model, - hostname: cfg.hostname, - webhook_secret: cfg.webhook_secret, - webhook_allowed_ips: cfg.webhook_allowed_ips, - pbs_host: cfg.pbs_host, - pve_host: cfg.pve_host, - pbs_trusted_sources: cfg.pbs_trusted_sources, - } - // Flatten channels: { telegram: { enabled, bot_token, chat_id } } -> telegram.enabled, telegram.bot_token, ... - for (const [chName, chCfg] of Object.entries(cfg.channels)) { - for (const [field, value] of Object.entries(chCfg)) { - flat[`${chName}.${field}`] = String(value ?? "") - } - } - // Flatten event_categories: { system: true, backups: false } -> events.system, events.backups - for (const [cat, enabled] of Object.entries(cfg.event_categories)) { - flat[`events.${cat}`] = String(enabled) - } - // Flatten event_toggles: { vm_start: true, vm_stop: false } -> event.vm_start, event.vm_stop - // Always write ALL toggles to DB so the backend has an explicit record. - // This ensures default_enabled changes in templates don't get overridden by stale DB values. - if (cfg.event_toggles) { - for (const [evt, enabled] of Object.entries(cfg.event_toggles)) { - flat[`event.${evt}`] = String(enabled) - } - } - // Also write any events NOT in event_toggles using their template defaults. - // This covers newly added templates whose default_enabled may be false. - if (cfg.event_types_by_group) { - for (const events of Object.values(cfg.event_types_by_group)) { - for (const evt of (events as Array<{type: string, default_enabled: boolean}>)) { - const key = `event.${evt.type}` - if (!(key in flat)) { - flat[key] = String(evt.default_enabled) - } - } - } - } - return flat - } - - const handleSave = async () => { - setSaving(true) - try { - // If notifications are being disabled, clean up PVE webhook first - const wasEnabled = originalConfig.enabled - const isNowDisabled = !config.enabled - - if (wasEnabled && isNowDisabled) { - try { - await fetchApi("/api/notifications/proxmox/cleanup-webhook", { method: "POST" }) - } catch { - // Non-fatal: webhook cleanup failed but we still save settings - } - } - - const payload = flattenConfig(config) - await fetchApi("/api/notifications/settings", { - method: "POST", - body: JSON.stringify(payload), - }) - setOriginalConfig(config) - setHasChanges(false) - setEditMode(false) - setSaved(true) - setTimeout(() => setSaved(false), 3000) - loadStatus() - } catch (err) { - console.error("Failed to save notification settings:", err) - } finally { - setSaving(false) - } - } - - const handleCancel = () => { - setConfig(originalConfig) - setHasChanges(false) - setEditMode(false) - } - - const handleTest = async (channel: string) => { - setTesting(channel) - setTestResult(null) - try { - // Auto-save current config before testing so backend has latest channel data - const payload = flattenConfig(config) - await fetchApi("/api/notifications/settings", { - method: "POST", - body: JSON.stringify(payload), - }) - setOriginalConfig(config) - setHasChanges(false) - - const data = await fetchApi<{ - success: boolean - message?: string - error?: string - results?: Record - }>("/api/notifications/test", { - method: "POST", - body: JSON.stringify({ channel }), - }) - - // Extract message from the results object if present - let message = data.message || "" - if (!message && data.results) { - const channelResult = data.results[channel] - if (channelResult) { - message = channelResult.success - ? "Test notification sent successfully" - : channelResult.error || "Test failed" - } - } - if (!message && data.error) { - message = data.error - } - if (!message) { - message = data.success ? "Test notification sent successfully" : "Test failed" - } - - setTestResult({ channel, success: data.success, message }) - } catch (err) { - setTestResult({ channel, success: false, message: String(err) }) - } finally { - setTesting(null) - setTimeout(() => setTestResult(null), 8000) - } - } - - const handleClearHistory = async () => { - try { - await fetchApi("/api/notifications/history", { method: "DELETE" }) - setHistory([]) - } catch { - // Ignore - } - } - - const toggleSecret = (key: string) => { - setShowSecrets(prev => ({ ...prev, [key]: !prev[key] })) - } - - if (loading) { - return ( - - -
- - Notifications -
-
- -
-
-
- - - ) - } - - const activeChannels = Object.entries(config.channels).filter(([, ch]) => ch.enabled).length - - const handleEnable = async () => { - setSaving(true) - setWebhookSetup({ status: "running", fallback_commands: [], error: "" }) - try { - // 1) Save enabled=true - const newConfig = { ...config, enabled: true } - await fetchApi("/api/notifications/settings", { - method: "POST", - body: JSON.stringify(newConfig), - }) - setConfig(newConfig) - setOriginalConfig(newConfig) - - // 2) Auto-configure PVE webhook - try { - const setup = await fetchApi<{ - configured: boolean - secret?: string - fallback_commands?: string[] - error?: string - }>("/api/notifications/proxmox/setup-webhook", { method: "POST" }) - - if (setup.configured) { - setWebhookSetup({ status: "success", fallback_commands: [], error: "" }) - // Update secret in local config if one was generated - if (setup.secret) { - const updated = { ...newConfig, webhook_secret: setup.secret } - setConfig(updated) - setOriginalConfig(updated) - } - } else { - setWebhookSetup({ - status: "failed", - fallback_commands: setup.fallback_commands || [], - error: setup.error || "Unknown error", - }) - } - } catch { - setWebhookSetup({ - status: "failed", - fallback_commands: [], - error: "Could not reach setup endpoint", - }) - } - - setEditMode(true) - loadStatus() - } catch (err) { - console.error("Failed to enable notifications:", err) - setWebhookSetup({ status: "idle", fallback_commands: [], error: "" }) - } finally { - setSaving(false) - } - } - - // ── Disabled state: show activation card ── - if (!config.enabled && !editMode) { - return ( - - -
- - Notifications - - Disabled - -
- - Get real-time alerts about your Proxmox environment via Telegram, Discord, Gotify, or Email. - -
- -
-
-
- -
-

Enable notification service

-

- Monitor system health, VM/CT events, backups, security alerts, and cluster status. - PVE webhook integration is configured automatically. -

-
-
-
- -
- - {/* Webhook setup result */} - {webhookSetup.status === "success" && ( -
- -

- PVE webhook configured automatically. Proxmox will send notifications to ProxMenux. -

-
- )} - {webhookSetup.status === "failed" && ( -
-
- -
-

- Automatic PVE configuration failed: {webhookSetup.error} -

-

- Notifications are enabled. Run the commands below on the PVE host to complete webhook setup. -

-
-
- {webhookSetup.fallback_commands.length > 0 && ( -
-{webhookSetup.fallback_commands.join('\n')}
-                    
- )} -
- )} -
- - {/* PBS manual section (collapsible) */} -
- - - - Configure PBS notifications (manual) - -
-
-

- PVE backups launched from the PVE interface are covered automatically by the PVE webhook above. -

-

- However, PBS has its own internal jobs (Verify, Prune, GC, Sync) that generate - separate notifications. These must be configured directly on the PBS server. -

-
-
-

- Append to /etc/proxmox-backup/notifications.cfg on the PBS host: -

-
-{`webhook: proxmenux-webhook
-\tmethod post
-\turl http://:8008/api/notifications/webhook
-
-matcher: proxmenux-pbs
-\ttarget proxmenux-webhook
-\tmatch-severity warning,error`}
-                  
-
-
- -

- {"Replace with the IP of this PVE node (not 127.0.0.1, unless PBS runs on the same host). Append at the end -- do not delete existing content."} -

-
-
-
-
-
-
- ) - } - - return ( - - -
-
- - Notifications - {config.enabled && ( - - Active - - )} -
-
- {saved && ( - - - Saved - - )} - {editMode ? ( - <> - - - - ) : ( - - )} -
-
- - Configure notification channels and event filters. Receive alerts via Telegram, Gotify, Discord, or Email. - -
- - - {/* ── Service Status ── */} - {status && ( -
-
-
- - {status.running ? "Service running" : "Service stopped"} - - {status.total_sent_24h > 0 && ( - - {status.total_sent_24h} sent in last 24h - - )} -
- {activeChannels > 0 && ( - - {activeChannels} channel{activeChannels > 1 ? "s" : ""} - - )} -
- )} - - {/* ── Enable/Disable ── */} -
-
- {config.enabled ? ( - - ) : ( - - )} -
- Enable Notifications -

Activate the notification service

-
-
- -
- - {config.enabled && ( - <> - {/* ── Channel Configuration ── */} -
-
- - Channels -
- -
- - - - Telegram - - - Gotify - - - Discord - - - Email - - - - {/* Telegram */} - -
- - -
- {config.channels.telegram?.enabled && ( - <> -
- -
- updateChannel("telegram", "bot_token", e.target.value)} - /> - -
-
-
- - updateChannel("telegram", "chat_id", e.target.value)} - /> -
- {/* Per-channel action bar */} -
- - -
- - )} -
- - {/* Gotify */} - -
- - -
- {config.channels.gotify?.enabled && ( - <> -
- - updateChannel("gotify", "url", e.target.value)} - /> -
-
- -
- updateChannel("gotify", "token", e.target.value)} - /> - -
-
- {/* Per-channel action bar */} -
- - -
- - )} -
- - {/* Discord */} - -
- - -
- {config.channels.discord?.enabled && ( - <> -
- -
- updateChannel("discord", "webhook_url", e.target.value)} - /> - -
-
- {/* Per-channel action bar */} -
- - -
- - )} -
- - {/* Email */} - -
- - -
- {config.channels.email?.enabled && ( - <> -
-
- - updateChannel("email", "host", e.target.value)} - /> -
-
- - updateChannel("email", "port", e.target.value)} - /> -
-
-
- - -
-
-
- - updateChannel("email", "username", e.target.value)} - /> -
-
- -
- updateChannel("email", "password", e.target.value)} - /> - -
-
-
-
- - updateChannel("email", "from_address", e.target.value)} - /> -
-
- - updateChannel("email", "to_addresses", e.target.value)} - /> -
-
- - updateChannel("email", "subject_prefix", e.target.value)} - /> -
-
- -

- Leave SMTP Host empty to use local sendmail (must be installed on the server). - For Gmail, use an App Password instead of your account password. -

-
- {/* Per-channel action bar */} -
- - -
- - )} -
-
- - {/* Test Result */} - {testResult && ( -
- {testResult.success ? ( - - ) : ( - - )} - {testResult.message} -
- )} -
{/* close bordered channel container */} -
- - {/* ── Filters ── */} -
-
- - Filters & Events -
-
- {/* Severity */} -
- - -
- - {/* Event Categories */} -
- -
- {EVENT_CATEGORIES.map(cat => { - const isEnabled = config.event_categories[cat.key] ?? true - const isExpanded = expandedCategories.has(cat.key) - const eventsForGroup = config.event_types_by_group?.[cat.key] || [] - const enabledCount = eventsForGroup.filter(e => config.event_toggles?.[e.type] ?? e.default_enabled).length - - return ( -
- {/* Category header row */} -
- {/* Expand/collapse button */} - - - {/* Label + description */} -
- - {cat.label} - - {cat.desc} -
- - {/* Count badge */} - {isEnabled && eventsForGroup.length > 0 && ( - - {enabledCount}/{eventsForGroup.length} - - )} - - {/* Category toggle */} - -
- - {/* Per-event toggles (expanded) */} - {isEnabled && isExpanded && eventsForGroup.length > 0 && ( -
- {eventsForGroup.map(evt => { - const evtEnabled = config.event_toggles?.[evt.type] ?? evt.default_enabled - return ( -
- - {evt.title} - - -
- ) - })} -
- )} -
- ) - })} -
-
-
{/* close bordered filters container */} -
- - {/* ── Proxmox Webhook ── */} -
-
- - Proxmox Webhook -
-
-
-
- PVE Webhook Configuration -
- {!editMode && ( - - )} -
- - {/* Setup status inline */} - {webhookSetup.status === "success" && ( -
- -

PVE webhook configured successfully.

-
- )} - {webhookSetup.status === "failed" && ( -
-
- -

PVE auto-config failed: {webhookSetup.error}

-
- {webhookSetup.fallback_commands.length > 0 && ( -
-{webhookSetup.fallback_commands.join('\n')}
-                    
- )} -
- )} - -
- -
- updateConfig(p => ({ ...p, webhook_secret: e.target.value }))} - disabled={!editMode} - /> - -
-

- {"Used for remote connections only (e.g. PBS on another host). Local PVE webhook runs on localhost and does not need this header."} -

-
-
- - updateConfig(p => ({ ...p, webhook_allowed_ips: e.target.value }))} - disabled={!editMode} - /> -

- {"Localhost (127.0.0.1) is always allowed. This restricts remote callers only."} -

-
-
{/* close bordered webhook container */} - - {/* PBS manual guide (collapsible) */} -
- - - Configure PBS notifications (manual) - -
-

- Backups launched from PVE are covered by the PVE webhook. PBS internal jobs - (Verify, Prune, GC, Sync) require separate configuration on the PBS server. -

-

- Append to /etc/proxmox-backup/notifications.cfg: -

-
-{`webhook: proxmenux-webhook
-\tmethod post
-\turl http://:8008/api/notifications/webhook
-
-matcher: proxmenux-pbs
-\ttarget proxmenux-webhook
-\tmatch-severity warning,error`}
-                  
-

- {"Replace with this node's IP. Append at the end -- do not delete existing content."} -

-
-
-
- - {/* ── Advanced: AI Enhancement ── */} -
- - - {showAdvanced && ( -
-
-
- AI-Enhanced Messages -

Use AI to generate contextual notification messages

-
- -
- - {config.ai_enabled && ( - <> -
- - -
-
- -
- updateConfig(p => ({ ...p, ai_api_key: e.target.value }))} - disabled={!editMode} - /> - -
-
-
- - updateConfig(p => ({ ...p, ai_model: e.target.value }))} - disabled={!editMode} - /> -
-
- -

- AI enhancement is optional. When enabled, notifications include contextual analysis and recommended actions. If the AI service is unavailable, standard templates are used as fallback. -

-
- - )} -
- )} -
- - {/* ── Notification History ── */} -
- - - {showHistory && ( -
- {history.length === 0 ? ( -

No notifications sent yet

- ) : ( - <> -
- -
-
- {history.map(entry => ( -
- {entry.success ? ( - - ) : ( - - )} -
- {entry.title || entry.event_type} - - {entry.channel} - {new Date(entry.sent_at).toLocaleString()} - -
- - {entry.severity} - -
- ))} -
- - )} -
- )} -
- - )} - - {/* ── Footer info ── */} -
- -

- {config.enabled - ? "Notifications are active. Events matching your severity filter and category selection will be sent to configured channels." - : "Enable notifications to receive alerts about system events, health status changes, and security incidents via Telegram, Gotify, Discord, or Email."} -

-
- - - ) -} diff --git a/AppImage/components/settings.tsx b/AppImage/components/settings.tsx index f2631177..4f037221 100644 --- a/AppImage/components/settings.tsx +++ b/AppImage/components/settings.tsx @@ -3,7 +3,6 @@ import { useState, useEffect } from "react" import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "./ui/card" import { Wrench, Package, Ruler, HeartPulse, Cpu, MemoryStick, HardDrive, CircleDot, Network, Server, Settings2, FileText, RefreshCw, Shield, AlertTriangle, Info, Loader2, Check } from "lucide-react" -import { NotificationSettings } from "./notification-settings" import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "./ui/select" import { Input } from "./ui/input" import { Badge } from "./ui/badge" @@ -439,9 +438,6 @@ export function Settings() { - {/* Notification Settings */} - - {/* ProxMenux Optimizations */} diff --git a/AppImage/components/storage-overview.tsx b/AppImage/components/storage-overview.tsx index 648993b9..a9a4cf79 100644 --- a/AppImage/components/storage-overview.tsx +++ b/AppImage/components/storage-overview.tsx @@ -34,12 +34,6 @@ interface DiskInfo { wear_leveling_count?: number // SSD: Wear Leveling Count total_lbas_written?: number // SSD/NVMe: Total LBAs Written (GB) ssd_life_left?: number // SSD: SSD Life Left percentage - io_errors?: { - count: number - severity: string - sample: string - reason: string - } } interface ZFSPool { @@ -782,17 +776,6 @@ export function StorageOverview() {
- {disk.io_errors && disk.io_errors.count > 0 && ( -
- - {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min -
- )} -
{disk.size_formatted && (
@@ -858,22 +841,6 @@ export function StorageOverview() {
- {disk.io_errors && disk.io_errors.count > 0 && ( -
- -
- {disk.io_errors.count} I/O error{disk.io_errors.count !== 1 ? 's' : ''} in 5 min - {disk.io_errors.sample && ( -

{disk.io_errors.sample}

- )} -
-
- )} -
{disk.size_formatted && (
diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 3b5f3090..447dd60e 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -91,11 +91,6 @@ cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || e cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_script_runner.py not found" cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ security_manager.py not found" cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_security_routes.py not found" -cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_manager.py not found" -cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found" -cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found" -cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_events.py not found" -cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_notification_routes.py not found" echo "📋 Adding translation support..." cat > "$APP_DIR/usr/bin/translate_cli.py" << 'PYEOF' diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py deleted file mode 100644 index c0a79c63..00000000 --- a/AppImage/scripts/flask_notification_routes.py +++ /dev/null @@ -1,695 +0,0 @@ -""" -Flask routes for notification service configuration and management. -Blueprint pattern matching flask_health_routes.py / flask_security_routes.py. -""" - -import hmac -import time -import hashlib -from collections import deque -from flask import Blueprint, jsonify, request -from notification_manager import notification_manager - - -# ─── Webhook Hardening Helpers ─────────────────────────────────── - -class WebhookRateLimiter: - """Simple sliding-window rate limiter for the webhook endpoint.""" - - def __init__(self, max_requests: int = 60, window_seconds: int = 60): - self._max = max_requests - self._window = window_seconds - self._timestamps: deque = deque() - - def allow(self) -> bool: - now = time.time() - # Prune entries outside the window - while self._timestamps and now - self._timestamps[0] > self._window: - self._timestamps.popleft() - if len(self._timestamps) >= self._max: - return False - self._timestamps.append(now) - return True - - -class ReplayCache: - """Bounded in-memory cache of recently seen request signatures (60s TTL).""" - - _MAX_SIZE = 2000 # Hard cap to prevent memory growth - - def __init__(self, ttl: int = 60): - self._ttl = ttl - self._seen: dict = {} # signature -> timestamp - - def check_and_record(self, signature: str) -> bool: - """Return True if this signature was already seen (replay). Records it otherwise.""" - now = time.time() - # Periodic cleanup - if len(self._seen) > self._MAX_SIZE // 2: - cutoff = now - self._ttl - self._seen = {k: v for k, v in self._seen.items() if v > cutoff} - if signature in self._seen and now - self._seen[signature] < self._ttl: - return True # Replay detected - self._seen[signature] = now - return False - - -# Module-level singletons (one per process) -_webhook_limiter = WebhookRateLimiter(max_requests=60, window_seconds=60) -_replay_cache = ReplayCache(ttl=60) - -# Timestamp validation window (seconds) -_TIMESTAMP_MAX_DRIFT = 60 - -notification_bp = Blueprint('notifications', __name__) - - -@notification_bp.route('/api/notifications/settings', methods=['GET']) -def get_notification_settings(): - """Get all notification settings for the UI.""" - try: - settings = notification_manager.get_settings() - return jsonify(settings) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/settings', methods=['POST']) -def save_notification_settings(): - """Save notification settings from the UI.""" - try: - payload = request.get_json() - if not payload: - return jsonify({'error': 'No data provided'}), 400 - - result = notification_manager.save_settings(payload) - return jsonify(result) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/test', methods=['POST']) -def test_notification(): - """Send a test notification to one or all channels.""" - try: - data = request.get_json() or {} - channel = data.get('channel', 'all') - - result = notification_manager.test_channel(channel) - return jsonify(result) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/status', methods=['GET']) -def get_notification_status(): - """Get notification service status.""" - try: - status = notification_manager.get_status() - return jsonify(status) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/history', methods=['GET']) -def get_notification_history(): - """Get notification history with optional filters.""" - try: - limit = request.args.get('limit', 100, type=int) - offset = request.args.get('offset', 0, type=int) - severity = request.args.get('severity', '') - channel = request.args.get('channel', '') - - result = notification_manager.get_history(limit, offset, severity, channel) - return jsonify(result) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/history', methods=['DELETE']) -def clear_notification_history(): - """Clear all notification history.""" - try: - result = notification_manager.clear_history() - return jsonify(result) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -@notification_bp.route('/api/notifications/send', methods=['POST']) -def send_notification(): - """Send a notification via API (for testing or external triggers).""" - try: - data = request.get_json() - if not data: - return jsonify({'error': 'No data provided'}), 400 - - result = notification_manager.send_notification( - event_type=data.get('event_type', 'custom'), - severity=data.get('severity', 'INFO'), - title=data.get('title', ''), - message=data.get('message', ''), - data=data.get('data', {}), - source='api' - ) - return jsonify(result) - except Exception as e: - return jsonify({'error': str(e)}), 500 - - -# ── PVE config constants ── -_PVE_ENDPOINT_ID = 'proxmenux-webhook' -_PVE_MATCHER_ID = 'proxmenux-default' -_PVE_WEBHOOK_URL = 'http://127.0.0.1:8008/api/notifications/webhook' -_PVE_NOTIFICATIONS_CFG = '/etc/pve/notifications.cfg' -_PVE_PRIV_CFG = '/etc/pve/priv/notifications.cfg' -_PVE_OUR_HEADERS = { - f'webhook: {_PVE_ENDPOINT_ID}', - f'matcher: {_PVE_MATCHER_ID}', -} - - -def _pve_read_file(path): - """Read file, return (content, error). Content is '' if missing.""" - try: - with open(path, 'r') as f: - return f.read(), None - except FileNotFoundError: - return '', None - except PermissionError: - return None, f'Permission denied reading {path}' - except Exception as e: - return None, str(e) - - -def _pve_backup_file(path): - """Create timestamped backup if file exists. Never fails fatally.""" - import os, shutil - from datetime import datetime - try: - if os.path.exists(path): - ts = datetime.now().strftime('%Y%m%d_%H%M%S') - backup = f"{path}.proxmenux_backup_{ts}" - shutil.copy2(path, backup) - except Exception: - pass - - -def _pve_remove_our_blocks(text, headers_to_remove): - """Remove only blocks whose header line matches one of ours. - - Preserves ALL other content byte-for-byte. - A block = header line + indented continuation lines + trailing blank line. - """ - lines = text.splitlines(keepends=True) - cleaned = [] - skip_block = False - - for line in lines: - stripped = line.strip() - - if stripped and not line[0:1].isspace() and ':' in stripped: - if stripped in headers_to_remove: - skip_block = True - continue - else: - skip_block = False - - if skip_block: - if not stripped: - skip_block = False - continue - elif line[0:1].isspace(): - continue - else: - skip_block = False - - cleaned.append(line) - - return ''.join(cleaned) - - -def _build_webhook_fallback(): - """Build fallback manual commands for webhook setup.""" - import base64 - body_tpl = '{"title":"{{ escape title }}","message":"{{ escape message }}","severity":"{{ severity }}","timestamp":"{{ timestamp }}","fields":{{ json fields }}}' - body_b64 = base64.b64encode(body_tpl.encode()).decode() - return [ - "# 1. Append to END of /etc/pve/notifications.cfg", - "# (do NOT delete existing content):", - "", - f"webhook: {_PVE_ENDPOINT_ID}", - f"\tbody {body_b64}", - f"\tmethod post", - f"\turl {_PVE_WEBHOOK_URL}", - "", - f"matcher: {_PVE_MATCHER_ID}", - f"\ttarget {_PVE_ENDPOINT_ID}", - "\tmode all", - "", - "# 2. Append to /etc/pve/priv/notifications.cfg :", - f"webhook: {_PVE_ENDPOINT_ID}", - ] - - -def setup_pve_webhook_core() -> dict: - """Core logic to configure PVE webhook. Callable from anywhere. - - Returns dict with 'configured', 'error', 'fallback_commands' keys. - Idempotent: safe to call multiple times. - """ - import secrets as secrets_mod - - result = { - 'configured': False, - 'endpoint_id': _PVE_ENDPOINT_ID, - 'matcher_id': _PVE_MATCHER_ID, - 'url': _PVE_WEBHOOK_URL, - 'fallback_commands': [], - 'error': None, - } - - try: - # ── Step 1: Ensure webhook secret exists (for our own internal use) ── - secret = notification_manager.get_webhook_secret() - if not secret: - secret = secrets_mod.token_urlsafe(32) - notification_manager._save_setting('webhook_secret', secret) - - # ── Step 2: Read main config ── - cfg_text, err = _pve_read_file(_PVE_NOTIFICATIONS_CFG) - if err: - result['error'] = err - result['fallback_commands'] = _build_webhook_fallback() - return result - - # ── Step 3: Read priv config (to clean up any broken blocks we wrote before) ── - priv_text, err = _pve_read_file(_PVE_PRIV_CFG) - if err: - priv_text = None - - # ── Step 4: Create backups before ANY modification ── - _pve_backup_file(_PVE_NOTIFICATIONS_CFG) - if priv_text is not None: - _pve_backup_file(_PVE_PRIV_CFG) - - # ── Step 5: Remove any previous proxmenux blocks from BOTH files ── - cleaned_cfg = _pve_remove_our_blocks(cfg_text, _PVE_OUR_HEADERS) - - if priv_text is not None: - cleaned_priv = _pve_remove_our_blocks(priv_text, _PVE_OUR_HEADERS) - - # ── Step 6: Build new blocks ── - # Exact format from a real working PVE server: - # webhook: name - # \tmethod post - # \turl http://... - # - # NO header lines -- localhost webhook doesn't need them. - # PVE header format is: header name=X-Key,value= - # PVE secret format is: secret name=key,value= - # Neither is needed for localhost calls. - - # PVE stores body as base64 in the config file. - # {{ escape title/message }} -- JSON-safe escaping of quotes/newlines. - # {{ json fields }} -- renders ALL PVE metadata as a JSON object - # (type, hostname, job-id). This is a single Handlebars helper - # that always works, even if fields is empty (renders {}). - import base64 - body_template = '{"title":"{{ escape title }}","message":"{{ escape message }}","severity":"{{ severity }}","timestamp":"{{ timestamp }}","fields":{{ json fields }}}' - body_b64 = base64.b64encode(body_template.encode()).decode() - - endpoint_block = ( - f"webhook: {_PVE_ENDPOINT_ID}\n" - f"\tbody {body_b64}\n" - f"\tmethod post\n" - f"\turl {_PVE_WEBHOOK_URL}\n" - ) - - matcher_block = ( - f"matcher: {_PVE_MATCHER_ID}\n" - f"\ttarget {_PVE_ENDPOINT_ID}\n" - f"\tmode all\n" - ) - - # ── Step 7: Append our blocks to cleaned main config ── - if cleaned_cfg and not cleaned_cfg.endswith('\n'): - cleaned_cfg += '\n' - if cleaned_cfg and not cleaned_cfg.endswith('\n\n'): - cleaned_cfg += '\n' - - new_cfg = cleaned_cfg + endpoint_block + '\n' + matcher_block - - # ── Step 8: Write main config ── - try: - with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: - f.write(new_cfg) - except PermissionError: - result['error'] = f'Permission denied writing {_PVE_NOTIFICATIONS_CFG}' - result['fallback_commands'] = _build_webhook_fallback() - return result - except Exception as e: - try: - with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: - f.write(cfg_text) - except Exception: - pass - result['error'] = str(e) - result['fallback_commands'] = _build_webhook_fallback() - return result - - # ── Step 9: Write priv config with our webhook entry ── - # PVE REQUIRES a matching block in priv/notifications.cfg for every - # webhook endpoint, even if it has no secrets. Without it PVE throws: - # "Could not instantiate endpoint: private config does not exist" - priv_block = ( - f"webhook: {_PVE_ENDPOINT_ID}\n" - ) - - if priv_text is not None: - # Start from cleaned priv (our old blocks removed) - if cleaned_priv and not cleaned_priv.endswith('\n'): - cleaned_priv += '\n' - if cleaned_priv and not cleaned_priv.endswith('\n\n'): - cleaned_priv += '\n' - new_priv = cleaned_priv + priv_block - else: - new_priv = priv_block - - try: - with open(_PVE_PRIV_CFG, 'w') as f: - f.write(new_priv) - except PermissionError: - result['error'] = f'Permission denied writing {_PVE_PRIV_CFG}' - result['fallback_commands'] = _build_webhook_fallback() - return result - except Exception: - pass - - result['configured'] = True - result['secret'] = secret - return result - - except Exception as e: - result['error'] = str(e) - result['fallback_commands'] = _build_webhook_fallback() - return result - - -@notification_bp.route('/api/notifications/proxmox/setup-webhook', methods=['POST']) -def setup_proxmox_webhook(): - """HTTP endpoint wrapper for webhook setup.""" - return jsonify(setup_pve_webhook_core()), 200 - - -def cleanup_pve_webhook_core() -> dict: - """Core logic to remove PVE webhook blocks. Callable from anywhere. - - Returns dict with 'cleaned', 'error' keys. - Only removes blocks named 'proxmenux-webhook' / 'proxmenux-default'. - """ - result = {'cleaned': False, 'error': None} - - try: - # Read both files - cfg_text, err = _pve_read_file(_PVE_NOTIFICATIONS_CFG) - if err: - result['error'] = err - return result - - priv_text, err = _pve_read_file(_PVE_PRIV_CFG) - if err: - priv_text = None - - # Check if our blocks actually exist before doing anything - has_our_blocks = any( - h in cfg_text for h in [f'webhook: {_PVE_ENDPOINT_ID}', f'matcher: {_PVE_MATCHER_ID}'] - ) - has_priv_blocks = priv_text and f'webhook: {_PVE_ENDPOINT_ID}' in priv_text - - if not has_our_blocks and not has_priv_blocks: - result['cleaned'] = True - return result - - # Backup before modification - _pve_backup_file(_PVE_NOTIFICATIONS_CFG) - if priv_text is not None: - _pve_backup_file(_PVE_PRIV_CFG) - - # Remove our blocks - if has_our_blocks: - cleaned_cfg = _pve_remove_our_blocks(cfg_text, _PVE_OUR_HEADERS) - try: - with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: - f.write(cleaned_cfg) - except PermissionError: - result['error'] = f'Permission denied writing {_PVE_NOTIFICATIONS_CFG}' - return result - except Exception as e: - # Rollback - try: - with open(_PVE_NOTIFICATIONS_CFG, 'w') as f: - f.write(cfg_text) - except Exception: - pass - result['error'] = str(e) - return result - - if has_priv_blocks and priv_text is not None: - cleaned_priv = _pve_remove_our_blocks(priv_text, _PVE_OUR_HEADERS) - try: - with open(_PVE_PRIV_CFG, 'w') as f: - f.write(cleaned_priv) - except Exception: - pass # Best-effort - - result['cleaned'] = True - return result - - except Exception as e: - result['error'] = str(e) - return result - - -@notification_bp.route('/api/notifications/proxmox/cleanup-webhook', methods=['POST']) -def cleanup_proxmox_webhook(): - """HTTP endpoint wrapper for webhook cleanup.""" - return jsonify(cleanup_pve_webhook_core()), 200 - - -@notification_bp.route('/api/notifications/proxmox/read-cfg', methods=['GET']) -def read_pve_notification_cfg(): - """Diagnostic: return raw content of PVE notification config files. - - GET /api/notifications/proxmox/read-cfg - Returns both notifications.cfg and priv/notifications.cfg content. - """ - import os - - files = { - 'notifications_cfg': '/etc/pve/notifications.cfg', - 'priv_cfg': '/etc/pve/priv/notifications.cfg', - } - - # Also look for any backups we created - backup_dir = '/etc/pve' - priv_backup_dir = '/etc/pve/priv' - - result = {} - for key, path in files.items(): - try: - with open(path, 'r') as f: - result[key] = { - 'path': path, - 'content': f.read(), - 'size': os.path.getsize(path), - 'error': None, - } - except FileNotFoundError: - result[key] = {'path': path, 'content': None, 'size': 0, 'error': 'file_not_found'} - except PermissionError: - result[key] = {'path': path, 'content': None, 'size': 0, 'error': 'permission_denied'} - except Exception as e: - result[key] = {'path': path, 'content': None, 'size': 0, 'error': str(e)} - - # Find backups - backups = [] - for d in [backup_dir, priv_backup_dir]: - try: - for fname in sorted(os.listdir(d)): - if 'proxmenux_backup' in fname: - fpath = os.path.join(d, fname) - try: - with open(fpath, 'r') as f: - backups.append({ - 'path': fpath, - 'content': f.read(), - 'size': os.path.getsize(fpath), - }) - except Exception: - backups.append({'path': fpath, 'content': None, 'error': 'read_failed'}) - except Exception: - pass - - result['backups'] = backups - return jsonify(result), 200 - - -@notification_bp.route('/api/notifications/proxmox/restore-cfg', methods=['POST']) -def restore_pve_notification_cfg(): - """Restore PVE notification config from our backup. - - POST /api/notifications/proxmox/restore-cfg - Finds the most recent proxmenux_backup and restores it. - """ - import os - import shutil - - files_to_restore = { - '/etc/pve': '/etc/pve/notifications.cfg', - '/etc/pve/priv': '/etc/pve/priv/notifications.cfg', - } - - restored = [] - errors = [] - - for search_dir, target_path in files_to_restore.items(): - try: - candidates = sorted([ - f for f in os.listdir(search_dir) - if 'proxmenux_backup' in f and f.startswith('notifications.cfg') - ], reverse=True) - - if candidates: - backup_path = os.path.join(search_dir, candidates[0]) - shutil.copy2(backup_path, target_path) - restored.append({'target': target_path, 'from_backup': backup_path}) - else: - errors.append({'target': target_path, 'error': 'no_backup_found'}) - except Exception as e: - errors.append({'target': target_path, 'error': str(e)}) - - return jsonify({ - 'restored': restored, - 'errors': errors, - 'success': len(errors) == 0 and len(restored) > 0, - }), 200 - - -@notification_bp.route('/api/notifications/webhook', methods=['POST']) -def proxmox_webhook(): - """Receive native Proxmox VE notification webhooks (hardened). - - Security layers: - Localhost (127.0.0.1 / ::1): rate limiting only. - PVE calls us on localhost and cannot send custom auth headers, - so we trust the loopback interface (only local processes can reach it). - Remote: rate limiting + shared secret + timestamp + replay + IP allowlist. - """ - _reject = lambda code, error, status: (jsonify({'accepted': False, 'error': error}), status) - - client_ip = request.remote_addr or '' - is_localhost = client_ip in ('127.0.0.1', '::1') - - # ── Layer 1: Rate limiting (always) ── - if not _webhook_limiter.allow(): - resp = jsonify({'accepted': False, 'error': 'rate_limited'}) - resp.headers['Retry-After'] = '60' - return resp, 429 - - # ── Layers 2-5: Remote-only checks ── - if not is_localhost: - # Layer 2: Shared secret - try: - configured_secret = notification_manager.get_webhook_secret() - except Exception: - configured_secret = '' - - if configured_secret: - request_secret = request.headers.get('X-Webhook-Secret', '') - if not request_secret: - return _reject(401, 'missing_secret', 401) - if not hmac.compare_digest(configured_secret, request_secret): - return _reject(401, 'invalid_secret', 401) - - # Layer 3: Anti-replay timestamp - ts_header = request.headers.get('X-ProxMenux-Timestamp', '') - if not ts_header: - return _reject(401, 'missing_timestamp', 401) - try: - ts_value = int(ts_header) - except (ValueError, TypeError): - return _reject(401, 'invalid_timestamp', 401) - if abs(time.time() - ts_value) > _TIMESTAMP_MAX_DRIFT: - return _reject(401, 'timestamp_expired', 401) - - # Layer 4: Replay cache - raw_body = request.get_data(as_text=True) or '' - signature = hashlib.sha256(f"{ts_value}:{raw_body}".encode(errors='replace')).hexdigest() - if _replay_cache.check_and_record(signature): - return _reject(409, 'replay_detected', 409) - - # Layer 5: IP allowlist - try: - allowed_ips = notification_manager.get_webhook_allowed_ips() - if allowed_ips and client_ip not in allowed_ips: - return _reject(403, 'forbidden_ip', 403) - except Exception: - pass - - # ── Parse and process payload ── - try: - content_type = request.content_type or '' - raw_data = request.get_data(as_text=True) or '' - - # Try JSON first - payload = request.get_json(silent=True) or {} - - # If not JSON, try form data - if not payload: - payload = dict(request.form) - - # If still empty, try parsing raw data as JSON (PVE may not set Content-Type) - if not payload and raw_data: - import json - try: - payload = json.loads(raw_data) - except (json.JSONDecodeError, ValueError): - # PVE's {{ message }} may contain unescaped newlines/quotes - # that break JSON. Try to repair common issues. - try: - repaired = raw_data.replace('\n', '\\n').replace('\r', '\\r') - payload = json.loads(repaired) - except (json.JSONDecodeError, ValueError): - # Try to extract fields with regex from broken JSON - import re - title_m = re.search(r'"title"\s*:\s*"([^"]*)"', raw_data) - sev_m = re.search(r'"severity"\s*:\s*"([^"]*)"', raw_data) - if title_m: - payload = { - 'title': title_m.group(1), - 'body': raw_data[:1000], - 'severity': sev_m.group(1) if sev_m else 'info', - 'source': 'proxmox_hook', - } - - # If still empty, try to salvage data from raw body - if not payload: - if raw_data: - # Last resort: treat raw text as the message body - payload = { - 'title': 'PVE Notification', - 'body': raw_data[:1000], - 'severity': 'info', - 'source': 'proxmox_hook', - } - else: - return _reject(400, 'empty_payload', 400) - - result = notification_manager.process_webhook(payload) - # Always return 200 to PVE -- a non-200 makes PVE report the webhook as broken. - # The 'accepted' field in the JSON body indicates actual processing status. - return jsonify(result), 200 - except Exception as e: - # Still return 200 to avoid PVE flagging the webhook as broken - return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200 diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index dd38090e..003d15f4 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -23,7 +23,6 @@ import time import threading import urllib.parse import hardware_monitor -import health_persistence import xml.etree.ElementTree as ET from datetime import datetime, timedelta from functools import wraps @@ -47,8 +46,6 @@ from flask_health_routes import health_bp # noqa: E402 from flask_auth_routes import auth_bp # noqa: E402 from flask_proxmenux_routes import proxmenux_bp # noqa: E402 from flask_security_routes import security_bp # noqa: E402 -from flask_notification_routes import notification_bp # noqa: E402 -from notification_manager import notification_manager # noqa: E402 from jwt_middleware import require_auth # noqa: E402 import auth_manager # noqa: E402 @@ -123,7 +120,6 @@ app.register_blueprint(auth_bp) app.register_blueprint(health_bp) app.register_blueprint(proxmenux_bp) app.register_blueprint(security_bp) -app.register_blueprint(notification_bp) # Initialize terminal / WebSocket routes init_terminal_routes(app) @@ -1160,66 +1156,19 @@ def get_storage_info(): 'ssd_life_left': smart_data.get('ssd_life_left') # Added } + storage_data['disk_count'] += 1 + health = smart_data.get('health', 'unknown').lower() + if health == 'healthy': + storage_data['healthy_disks'] += 1 + elif health == 'warning': + storage_data['warning_disks'] += 1 + elif health in ['critical', 'failed']: + storage_data['critical_disks'] += 1 + except Exception as e: + # print(f"Error getting disk list: {e}") pass - # Enrich physical disks with active I/O errors from health_persistence. - # This is the single source of truth -- health_monitor detects ATA/SCSI/IO - # errors via dmesg, records them in health_persistence, and we read them here. - try: - active_disk_errors = health_persistence.get_active_errors(category='disks') - for err in active_disk_errors: - details = err.get('details', {}) - if isinstance(details, str): - try: - details = json.loads(details) - except (json.JSONDecodeError, TypeError): - details = {} - - err_device = details.get('disk', '') - error_count = details.get('error_count', 0) - sample = details.get('sample', '') - severity = err.get('severity', 'WARNING') - - # Match error to physical disk. - # err_device can be 'sda', 'nvme0n1', or 'ata8' (if resolution failed) - matched_disk = None - if err_device in physical_disks: - matched_disk = err_device - else: - # Try partial match: 'sda' matches disk 'sda' - for dk in physical_disks: - if dk == err_device or err_device.startswith(dk): - matched_disk = dk - break - - if matched_disk: - physical_disks[matched_disk]['io_errors'] = { - 'count': error_count, - 'severity': severity, - 'sample': sample, - 'reason': err.get('reason', ''), - } - # Override health status if I/O errors are more severe - current_health = physical_disks[matched_disk].get('health', 'unknown').lower() - if severity == 'CRITICAL' and current_health != 'critical': - physical_disks[matched_disk]['health'] = 'critical' - elif severity == 'WARNING' and current_health in ('healthy', 'unknown'): - physical_disks[matched_disk]['health'] = 'warning' - except Exception: - pass - - # Count disk health states AFTER I/O error enrichment - for disk_name, disk_info in physical_disks.items(): - storage_data['disk_count'] += 1 - health = disk_info.get('health', 'unknown').lower() - if health == 'healthy': - storage_data['healthy_disks'] += 1 - elif health == 'warning': - storage_data['warning_disks'] += 1 - elif health in ['critical', 'failed']: - storage_data['critical_disks'] += 1 - storage_data['total'] = round(total_disk_size_bytes / (1024**4), 1) # Get disk usage for mounted partitions @@ -7145,16 +7094,6 @@ if __name__ == '__main__': except Exception as e: print(f"[ProxMenux] Vital signs sampler failed to start: {e}") - # ── Notification Service ── - try: - notification_manager.start() - if notification_manager._enabled: - print(f"[ProxMenux] Notification service started (channels: {list(notification_manager._channels.keys())})") - else: - print("[ProxMenux] Notification service loaded (disabled - configure in Settings)") - except Exception as e: - print(f"[ProxMenux] Notification service failed to start: {e}") - # Check for SSL configuration ssl_ctx = None try: diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 30eb884b..ef381192 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -324,13 +324,6 @@ class HealthMonitor: Returns JSON structure with ALL 10 categories always present. Now includes persistent error tracking. """ - # Run cleanup on every status check so stale errors are auto-resolved - # using the user-configured Suppression Duration (single source of truth). - try: - health_persistence.cleanup_old_errors() - except Exception: - pass - active_errors = health_persistence.get_active_errors() # No need to create persistent_issues dict here, it's implicitly handled by the checks @@ -828,20 +821,8 @@ class HealthMonitor: issues = [] storage_details = {} - # Check disk usage and mount status for important mounts. - # We detect actual mountpoints dynamically rather than hard-coding. - critical_mounts = set() - critical_mounts.add('/') - try: - for part in psutil.disk_partitions(all=False): - mp = part.mountpoint - # Include standard system mounts and PVE storage - if mp in ('/', '/var', '/tmp', '/boot', '/boot/efi') or \ - mp.startswith('/var/lib/vz') or mp.startswith('/mnt/'): - critical_mounts.add(mp) - except Exception: - pass - critical_mounts = sorted(critical_mounts) + # Check disk usage and mount status first for critical mounts + critical_mounts = ['/'] for mount_point in critical_mounts: try: @@ -876,32 +857,9 @@ class HealthMonitor: # Check filesystem usage only if not already flagged as critical if mount_point not in storage_details or storage_details[mount_point].get('status') == 'OK': fs_status = self._check_filesystem(mount_point) - error_key = f'disk_space_{mount_point}' if fs_status['status'] != 'OK': issues.append(f"{mount_point}: {fs_status['reason']}") storage_details[mount_point] = fs_status - # Record persistent error for notifications - usage = psutil.disk_usage(mount_point) - avail_gb = usage.free / (1024**3) - if avail_gb >= 1: - avail_str = f"{avail_gb:.1f} GiB" - else: - avail_str = f"{usage.free / (1024**2):.0f} MiB" - health_persistence.record_error( - error_key=error_key, - category='disk', - severity=fs_status['status'], - reason=f'{mount_point}: {fs_status["reason"]}', - details={ - 'mount': mount_point, - 'used': str(round(usage.percent, 1)), - 'available': avail_str, - 'dismissable': False, - } - ) - else: - # Space recovered -- clear any previous alert - health_persistence.clear_error(error_key) except Exception: pass # Silently skip if mountpoint check fails @@ -1094,67 +1052,16 @@ class HealthMonitor: return storages - def _resolve_ata_to_disk(self, ata_port: str) -> str: - """Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda'). - - Uses /sys/class/ata_port/ symlinks and /sys/block/ to find the mapping. - Falls back to parsing dmesg for 'ata8: SATA link up' -> 'sd 7:0:0:0: [sda]'. - """ - if not ata_port or not ata_port.startswith('ata'): - return ata_port - - port_num = ata_port.replace('ata', '') - - # Method 1: Walk /sys/class/ata_port/ -> host -> target -> block - try: - ata_path = f'/sys/class/ata_port/{ata_port}' - if os.path.exists(ata_path): - device_path = os.path.realpath(ata_path) - # Walk up to find the SCSI host, then find block devices - # Path: /sys/devices/.../ataX/hostY/targetY:0:0/Y:0:0:0/block/sdZ - for root, dirs, files in os.walk(os.path.dirname(device_path)): - if 'block' in dirs: - block_path = os.path.join(root, 'block') - devs = os.listdir(block_path) - if devs: - return devs[0] # e.g. 'sda' - except (OSError, IOError): - pass - - # Method 2: Parse dmesg for ATA link messages - try: - result = subprocess.run( - ['dmesg', '--notime'], - capture_output=True, text=True, timeout=2 - ) - if result.returncode == 0: - # Look for "ata8: SATA link up" followed by "sd X:0:0:0: [sda]" - lines = result.stdout.split('\n') - host_num = None - for line in lines: - m = re.search(rf'{ata_port}:\s+SATA link', line) - if m: - # ata port number maps to host(N-1) typically - host_num = int(port_num) - 1 - if host_num is not None: - m2 = re.search(rf'sd\s+{host_num}:\d+:\d+:\d+:\s+\[(\w+)\]', line) - if m2: - return m2.group(1) - except (OSError, subprocess.TimeoutExpired): - pass - - return ata_port # Return original if resolution fails - def _check_disks_optimized(self) -> Dict[str, Any]: """ - Disk I/O error check -- the SINGLE source of truth for disk errors. - - Reads dmesg for I/O/ATA/SCSI errors, counts per device, records in - health_persistence, and returns status for the health dashboard. - Resolves ATA controller names (ata8) to physical disks (sda). + Optimized disk check - always returns status. + Checks dmesg for I/O errors and SMART status. + NOTE: This function is now largely covered by _check_storage_optimized, + but kept for potential specific disk-level reporting if needed. + Currently, its primary function is to detect recent I/O errors. """ current_time = time.time() - disk_results = {} # Single dict for both WARNING and CRITICAL + disk_issues = {} try: # Check dmesg for I/O errors in the last 5 minutes @@ -1165,52 +1072,17 @@ class HealthMonitor: timeout=2 ) - # Collect a sample line per device for richer error messages - disk_samples = {} - if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - # Detect various disk error formats - is_disk_error = any(kw in line_lower for kw in [ - 'i/o error', 'scsi error', 'medium error', - 'failed command:', 'exception emask', - ]) - ata_match = re.search(r'(ata\d+)[\.\d]*:.*(?:error|failed|exception)', line_lower) - if ata_match: - is_disk_error = True - - if is_disk_error: - # Extract device from multiple formats - raw_device = None - for dev_re in [ - r'dev\s+(sd[a-z]+)', # dev sdb - r'\[(sd[a-z]+)\]', # [sda] - r'/dev/(sd[a-z]+)', # /dev/sda - r'(nvme\d+n\d+)', # nvme0n1 - r'device\s+(sd[a-z]+\d*)', # device sda1 - r'(ata\d+)', # ata8 (ATA controller) - ]: - dm = re.search(dev_re, line) - if dm: - raw_device = dm.group(1) - break - - if raw_device: - # Resolve ATA port to physical disk name - if raw_device.startswith('ata'): - resolved = self._resolve_ata_to_disk(raw_device) - disk_name = resolved - else: - disk_name = raw_device.rstrip('0123456789') if raw_device.startswith('sd') else raw_device - + if any(keyword in line_lower for keyword in ['i/o error', 'ata error', 'scsi error', 'medium error']): + # Try to extract disk name + disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+)', line) + if disk_match: + disk_name = disk_match.group(1) self.io_error_history[disk_name].append(current_time) - if disk_name not in disk_samples: - # Clean the sample: strip dmesg timestamp prefix - clean = re.sub(r'^\[.*?\]\s*', '', line.strip()) - disk_samples[disk_name] = clean[:200] - # Clean old history and evaluate per-disk status + # Clean old history (keep errors from the last 5 minutes) for disk in list(self.io_error_history.keys()): self.io_error_history[disk] = [ t for t in self.io_error_history[disk] @@ -1218,67 +1090,57 @@ class HealthMonitor: ] error_count = len(self.io_error_history[disk]) - error_key = f'disk_{disk}' - sample = disk_samples.get(disk, '') - display = f'/dev/{disk}' if not disk.startswith('/') else disk + # Report based on recent error count if error_count >= 3: + error_key = f'disk_{disk}' severity = 'CRITICAL' - reason = f'{display}: {error_count} I/O errors in 5 min' - if sample: - reason += f'\n{sample}' + reason = f'{error_count} I/O errors in 5 minutes' health_persistence.record_error( error_key=error_key, category='disks', severity=severity, reason=reason, - details={'disk': disk, 'device': display, - 'error_count': error_count, - 'sample': sample, 'dismissable': False} + details={'disk': disk, 'error_count': error_count, 'dismissable': False} ) - disk_results[display] = { + + disk_details[disk] = { 'status': severity, 'reason': reason, - 'device': disk, - 'error_count': error_count, - 'dismissable': False, + 'dismissable': False } elif error_count >= 1: + error_key = f'disk_{disk}' severity = 'WARNING' - reason = f'{display}: {error_count} I/O error(s) in 5 min' - if sample: - reason += f'\n{sample}' + reason = f'{error_count} I/O error(s) in 5 minutes' - rec_result = health_persistence.record_error( + health_persistence.record_error( error_key=error_key, category='disks', severity=severity, reason=reason, - details={'disk': disk, 'device': display, - 'error_count': error_count, - 'sample': sample, 'dismissable': True} + details={'disk': disk, 'error_count': error_count, 'dismissable': True} ) - if not rec_result or rec_result.get('type') != 'skipped_acknowledged': - disk_results[display] = { - 'status': severity, - 'reason': reason, - 'device': disk, - 'error_count': error_count, - 'dismissable': True, - } + + disk_issues[f'/dev/{disk}'] = { + 'status': severity, + 'reason': reason, + 'dismissable': True + } else: + error_key = f'disk_{disk}' health_persistence.resolve_error(error_key, 'Disk errors cleared') - if not disk_results: + if not disk_issues: return {'status': 'OK'} - has_critical = any(d.get('status') == 'CRITICAL' for d in disk_results.values()) + has_critical = any(d.get('status') == 'CRITICAL' for d in disk_issues.values()) return { 'status': 'CRITICAL' if has_critical else 'WARNING', - 'reason': f"{len(disk_results)} disk(s) with recent errors", - 'details': disk_results + 'reason': f"{len(disk_issues)} disk(s) with recent errors", + 'details': disk_issues } except Exception as e: @@ -1489,51 +1351,12 @@ class HealthMonitor: except Exception: return {'status': 'UNKNOWN', 'reason': 'Ping command failed'} - def _is_vzdump_active(self) -> bool: - """Check if a vzdump (backup) job is currently running.""" - try: - with open('/var/log/pve/tasks/active', 'r') as f: - for line in f: - if ':vzdump:' in line: - return True - except (OSError, IOError): - pass - return False - - def _resolve_vm_name(self, vmid: str) -> str: - """Resolve VMID to guest name from PVE config files.""" - if not vmid: - return '' - for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']: - conf = os.path.join(base, f'{vmid}.conf') - try: - with open(conf) as f: - for line in f: - if line.startswith('hostname:') or line.startswith('name:'): - return line.split(':', 1)[1].strip() - except (OSError, IOError): - continue - return '' - def _check_vms_cts_optimized(self) -> Dict[str, Any]: """ Optimized VM/CT check - detects qmp failures and startup errors from logs. Improved detection of container and VM errors from journalctl. """ try: - # First: auto-resolve any persisted VM/CT errors where the guest - # is now running. This clears stale "Failed to start" / QMP - # errors that are no longer relevant. - try: - active_vm_errors = health_persistence.get_active_errors('vms') - for err in active_vm_errors: - details = err.get('details') or {} - vmid = details.get('id', '') - if vmid: - health_persistence.check_vm_running(vmid) - except Exception: - pass - issues = [] vm_details = {} @@ -1544,28 +1367,20 @@ class HealthMonitor: timeout=3 ) - # Check if vzdump is running -- QMP timeouts during backup are normal - _vzdump_running = self._is_vzdump_active() - if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: - if _vzdump_running: - continue # Normal during backup vmid = vm_qmp_match.group(1) - vm_name = self._resolve_vm_name(vmid) - display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" key = f'vm_{vmid}' if key not in vm_details: - issues.append(f'{display}: QMP communication issue') + issues.append(f'VM {vmid}: Communication issue') vm_details[key] = { 'status': 'WARNING', - 'reason': f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', + 'reason': 'QMP command timeout', 'id': vmid, - 'vmname': vm_name, 'type': 'VM' } continue @@ -1586,15 +1401,11 @@ class HealthMonitor: else: reason = 'Container error' - ct_name = self._resolve_vm_name(ctid) - display = f"CT {ctid} ({ct_name})" if ct_name else f"CT {ctid}" - full_reason = f'{display}: {reason}\n{line.strip()[:200]}' - issues.append(f'{display}: {reason}') + issues.append(f'CT {ctid}: {reason}') vm_details[key] = { 'status': 'WARNING' if 'device' in reason.lower() else 'CRITICAL', - 'reason': full_reason, + 'reason': reason, 'id': ctid, - 'vmname': ct_name, 'type': 'CT' } continue @@ -1629,15 +1440,11 @@ class HealthMonitor: vmid = id_match.group(1) key = f'vmct_{vmid}' if key not in vm_details: - vm_name = self._resolve_vm_name(vmid) - display = f"VM/CT {vmid} ({vm_name})" if vm_name else f"VM/CT {vmid}" - full_reason = f'{display}: Failed to start\n{line.strip()[:200]}' - issues.append(f'{display}: Failed to start') + issues.append(f'VM/CT {vmid}: Failed to start') vm_details[key] = { 'status': 'CRITICAL', - 'reason': full_reason, + 'reason': 'Failed to start', 'id': vmid, - 'vmname': vm_name, 'type': 'VM/CT' } @@ -1697,38 +1504,31 @@ class HealthMonitor: timeout=3 ) - _vzdump_running = self._is_vzdump_active() - if result.returncode == 0: for line in result.stdout.split('\n'): line_lower = line.lower() - # VM QMP errors (skip during active backup -- normal behavior) + # VM QMP errors vm_qmp_match = re.search(r'vm\s+(\d+)\s+qmp\s+command.*(?:failed|unable|timeout)', line_lower) if vm_qmp_match: - if _vzdump_running: - continue # Normal during backup vmid = vm_qmp_match.group(1) - vm_name = self._resolve_vm_name(vmid) - display = f"VM {vmid} ({vm_name})" if vm_name else f"VM {vmid}" error_key = f'vm_{vmid}' if error_key not in vm_details: - rec_result = health_persistence.record_error( + # Record persistent error + health_persistence.record_error( error_key=error_key, category='vms', severity='WARNING', - reason=f'{display}: QMP command failed or timed out.\n{line.strip()[:200]}', - details={'id': vmid, 'vmname': vm_name, 'type': 'VM'} + reason='QMP command timeout', + details={'id': vmid, 'type': 'VM'} ) - if not rec_result or rec_result.get('type') != 'skipped_acknowledged': - issues.append(f'{display}: QMP communication issue') - vm_details[error_key] = { - 'status': 'WARNING', - 'reason': f'{display}: QMP command failed or timed out', - 'id': vmid, - 'vmname': vm_name, - 'type': 'VM' - } + issues.append(f'VM {vmid}: Communication issue') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': 'QMP command timeout', + 'id': vmid, + 'type': 'VM' + } continue # Container errors (including startup issues via vzstart) @@ -1748,21 +1548,20 @@ class HealthMonitor: reason = 'Startup error' # Record persistent error - rec_result = health_persistence.record_error( + health_persistence.record_error( error_key=error_key, category='vms', severity='WARNING', reason=reason, details={'id': ctid, 'type': 'CT'} ) - if not rec_result or rec_result.get('type') != 'skipped_acknowledged': - issues.append(f'CT {ctid}: {reason}') - vm_details[error_key] = { - 'status': 'WARNING', - 'reason': reason, - 'id': ctid, - 'type': 'CT' - } + issues.append(f'CT {ctid}: {reason}') + vm_details[error_key] = { + 'status': 'WARNING', + 'reason': reason, + 'id': ctid, + 'type': 'CT' + } # Generic failed to start for VMs and CTs if any(keyword in line_lower for keyword in ['failed to start', 'cannot start', 'activation failed', 'start error']): @@ -1787,28 +1586,22 @@ class HealthMonitor: vm_type = 'VM/CT' if error_key not in vm_details: - vm_name = self._resolve_vm_name(vmid_ctid) - display = f"{vm_type} {vmid_ctid}" - if vm_name: - display = f"{vm_type} {vmid_ctid} ({vm_name})" - reason = f'{display}: Failed to start\n{line.strip()[:200]}' + reason = 'Failed to start' # Record persistent error - rec_result = health_persistence.record_error( + health_persistence.record_error( error_key=error_key, category='vms', severity='CRITICAL', reason=reason, - details={'id': vmid_ctid, 'vmname': vm_name, 'type': vm_type} + details={'id': vmid_ctid, 'type': vm_type} ) - if not rec_result or rec_result.get('type') != 'skipped_acknowledged': - issues.append(f'{display}: Failed to start') - vm_details[error_key] = { - 'status': 'CRITICAL', - 'reason': reason, - 'id': vmid_ctid, - 'vmname': vm_name, - 'type': vm_type - } + issues.append(f'{vm_type} {vmid_ctid}: {reason}') + vm_details[error_key] = { + 'status': 'CRITICAL', + 'reason': reason, + 'id': vmid_ctid, + 'type': vm_type + } # Build checks dict from vm_details checks = {} @@ -1899,23 +1692,16 @@ class HealthMonitor: if failed_services: reason = f'Services inactive: {", ".join(failed_services)}' - # Record each failed service in persistence, respecting dismiss - active_failed = [] + # Record each failed service in persistence for svc in failed_services: error_key = f'pve_service_{svc}' - rec_result = health_persistence.record_error( + health_persistence.record_error( error_key=error_key, category='pve_services', severity='CRITICAL', reason=f'PVE service {svc} is {service_details.get(svc, "inactive")}', details={'service': svc, 'state': service_details.get(svc, 'inactive')} ) - if rec_result and rec_result.get('type') == 'skipped_acknowledged': - # Mark as dismissed in checks for frontend - if svc in checks: - checks[svc]['dismissed'] = True - else: - active_failed.append(svc) # Auto-clear services that recovered for svc in services_to_check: @@ -1924,21 +1710,10 @@ class HealthMonitor: if health_persistence.is_error_active(error_key): health_persistence.clear_error(error_key) - # If all failed services are dismissed, return OK - if not active_failed: - return { - 'status': 'OK', - 'reason': None, - 'failed': [], - 'is_cluster': is_cluster, - 'services_checked': len(services_to_check), - 'checks': checks - } - return { 'status': 'CRITICAL', - 'reason': f'Services inactive: {", ".join(active_failed)}', - 'failed': active_failed, + 'reason': reason, + 'failed': failed_services, 'is_cluster': is_cluster, 'services_checked': len(services_to_check), 'checks': checks @@ -2096,8 +1871,7 @@ class HealthMonitor: self.persistent_log_patterns[pattern] = { 'count': 1, 'first_seen': current_time, - 'last_seen': current_time, - 'sample': line.strip()[:200], # Original line for display + 'last_seen': current_time } for line in previous_lines: @@ -2129,18 +1903,6 @@ class HealthMonitor: if recent_count >= 5 and recent_count >= prev_count * 4: spike_errors[pattern] = recent_count - # Helper: get human-readable samples from normalized patterns - def _get_samples(error_dict, max_items=3): - """Return list of readable sample lines for error patterns.""" - samples = [] - for pattern in list(error_dict.keys())[:max_items]: - pdata = self.persistent_log_patterns.get(pattern, {}) - sample = pdata.get('sample', pattern) - # Trim timestamp prefix if present (e.g. "Feb 27 16:03:35 host ") - clean = re.sub(r'^[A-Z][a-z]{2}\s+\d+\s+[\d:]+\s+\S+\s+', '', sample) - samples.append(clean[:120]) - return samples - persistent_errors = {} for pattern, data in self.persistent_log_patterns.items(): time_span = current_time - data['first_seen'] @@ -2151,16 +1913,12 @@ class HealthMonitor: pattern_hash = hashlib.md5(pattern.encode()).hexdigest()[:8] error_key = f'log_persistent_{pattern_hash}' if not health_persistence.is_error_active(error_key, category='logs'): - # Use the original sample line for the notification, - # not the normalized pattern (which has IDs replaced). - sample = data.get('sample', pattern) health_persistence.record_error( error_key=error_key, category='logs', severity='WARNING', - reason=f'Recurring error ({data["count"]}x): {sample[:150]}', - details={'pattern': pattern, 'sample': sample, - 'dismissable': True, 'occurrences': data['count']} + reason=f'Persistent error pattern detected: {pattern[:80]}', + details={'pattern': pattern, 'dismissable': True, 'occurrences': data['count']} ) patterns_to_remove = [ @@ -2182,33 +1940,26 @@ class HealthMonitor: reason = f'Critical error detected: {representative_error[:100]}' elif cascade_count > 0: status = 'WARNING' - samples = _get_samples(cascading_errors, 3) - reason = f'Error cascade ({cascade_count} patterns repeating):\n' + '\n'.join(f' - {s}' for s in samples) + reason = f'Error cascade detected: {cascade_count} pattern(s) repeating ≥15 times in 3min' elif spike_count > 0: status = 'WARNING' - samples = _get_samples(spike_errors, 3) - reason = f'Error spike ({spike_count} patterns with 4x increase):\n' + '\n'.join(f' - {s}' for s in samples) + reason = f'Error spike detected: {spike_count} pattern(s) increased 4x' elif persistent_count > 0: status = 'WARNING' - samples = _get_samples(persistent_errors, 3) - reason = f'Persistent errors ({persistent_count} patterns over 15+ min):\n' + '\n'.join(f' - {s}' for s in samples) + reason = f'Persistent errors: {persistent_count} pattern(s) recurring over 15+ minutes' else: # No significant issues found status = 'OK' reason = None # Record/clear persistent errors for each log sub-check so Dismiss works - cascade_samples = _get_samples(cascading_errors, 2) if cascade_count else [] - spike_samples = _get_samples(spike_errors, 2) if spike_count else [] - persist_samples = _get_samples(persistent_errors, 2) if persistent_count else [] - log_sub_checks = { 'log_error_cascade': {'active': cascade_count > 0, 'severity': 'WARNING', - 'reason': f'{cascade_count} pattern(s) repeating >=15 times:\n' + '\n'.join(f' - {s}' for s in cascade_samples) if cascade_count else ''}, + 'reason': f'{cascade_count} pattern(s) repeating >=15 times'}, 'log_error_spike': {'active': spike_count > 0, 'severity': 'WARNING', - 'reason': f'{spike_count} pattern(s) with 4x increase:\n' + '\n'.join(f' - {s}' for s in spike_samples) if spike_count else ''}, + 'reason': f'{spike_count} pattern(s) with 4x increase'}, 'log_persistent_errors': {'active': persistent_count > 0, 'severity': 'WARNING', - 'reason': f'{persistent_count} recurring pattern(s) over 15+ min:\n' + '\n'.join(f' - {s}' for s in persist_samples) if persistent_count else ''}, + 'reason': f'{persistent_count} recurring pattern(s) over 15+ min'}, 'log_critical_errors': {'active': unique_critical_count > 0, 'severity': 'CRITICAL', 'reason': f'{unique_critical_count} critical error(s) found', 'dismissable': False}, } @@ -2584,7 +2335,20 @@ class HealthMonitor: msg = f'{total_banned} IP(s) currently banned by Fail2Ban (jails: {jails_str})' result['status'] = 'WARNING' result['detail'] = msg - # Persistence handled by _check_security caller via security_fail2ban key + + # Record in persistence (dismissable) + health_persistence.record_error( + error_key='fail2ban', + category='security', + severity='WARNING', + reason=msg, + details={ + 'banned_count': total_banned, + 'jails': jails_with_bans, + 'banned_ips': all_banned_ips[:5], + 'dismissable': True + } + ) else: result['detail'] = f'Fail2Ban active ({len(jails)} jail(s), no current bans)' # Auto-resolve if previously banned IPs are now gone @@ -2692,60 +2456,14 @@ class HealthMonitor: except Exception: pass - # Persist errors and respect dismiss for each sub-check - dismissed_keys = set() - security_sub_checks = { - 'security_login_attempts': checks.get('login_attempts', {}), - 'security_certificates': checks.get('certificates', {}), - 'security_uptime': checks.get('uptime', {}), - 'security_fail2ban': checks.get('fail2ban', {}), - } - - for err_key, check_info in security_sub_checks.items(): - check_status = check_info.get('status', 'OK') - if check_status not in ('OK', 'INFO'): - is_dismissable = check_info.get('dismissable', True) - rec_result = health_persistence.record_error( - error_key=err_key, - category='security', - severity=check_status, - reason=check_info.get('detail', ''), - details={'dismissable': is_dismissable} - ) - if rec_result and rec_result.get('type') == 'skipped_acknowledged': - dismissed_keys.add(err_key) - elif health_persistence.is_error_active(err_key): - health_persistence.clear_error(err_key) - - # Rebuild issues excluding dismissed sub-checks - key_to_check = { - 'security_login_attempts': 'login_attempts', - 'security_certificates': 'certificates', - 'security_uptime': 'uptime', - 'security_fail2ban': 'fail2ban', - } - active_issues = [] - for err_key, check_name in key_to_check.items(): - if err_key in dismissed_keys: - # Mark as dismissed in checks for the frontend - if check_name in checks: - checks[check_name]['dismissed'] = True - continue - check_info = checks.get(check_name, {}) - if check_info.get('status', 'OK') not in ('OK', 'INFO'): - active_issues.append(check_info.get('detail', '')) - - # Determine overall security status from non-dismissed issues only - if active_issues: - has_critical = any( - c.get('status') == 'CRITICAL' - for k, c in checks.items() - if f'security_{k}' not in dismissed_keys - ) + # Determine overall security status + if issues: + # Check if any sub-check is CRITICAL + has_critical = any(c.get('status') == 'CRITICAL' for c in checks.values()) overall_status = 'CRITICAL' if has_critical else 'WARNING' return { 'status': overall_status, - 'reason': '; '.join(active_issues[:2]), + 'reason': '; '.join(issues[:2]), 'checks': checks } diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index fede9b53..377f71da 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -25,8 +25,12 @@ from pathlib import Path class HealthPersistence: """Manages persistent health error tracking""" - # Default suppression duration when no user setting exists for a category. - # Users override per-category via the Suppression Duration settings UI. + # Error retention periods (seconds) + VM_ERROR_RETENTION = 48 * 3600 # 48 hours + LOG_ERROR_RETENTION = 24 * 3600 # 24 hours + DISK_ERROR_RETENTION = 48 * 3600 # 48 hours + + # Default suppression: 24 hours (user can change per-category in settings) DEFAULT_SUPPRESSION_HOURS = 24 # Mapping from error categories to settings keys @@ -110,31 +114,6 @@ class HealthPersistence: ) ''') - # Notification history table (records all sent notifications) - cursor.execute(''' - CREATE TABLE IF NOT EXISTS notification_history ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - event_type TEXT NOT NULL, - channel TEXT NOT NULL, - title TEXT, - message TEXT, - severity TEXT, - sent_at TEXT NOT NULL, - success INTEGER DEFAULT 1, - error_message TEXT, - source TEXT DEFAULT 'server' - ) - ''') - - # Notification cooldown persistence (survives restarts) - cursor.execute(''' - CREATE TABLE IF NOT EXISTS notification_last_sent ( - fingerprint TEXT PRIMARY KEY, - last_sent_ts INTEGER NOT NULL, - count INTEGER DEFAULT 1 - ) - ''') - # Migration: add suppression_hours column to errors if not present cursor.execute("PRAGMA table_info(errors)") columns = [col[1] for col in cursor.fetchall()] @@ -146,9 +125,6 @@ class HealthPersistence: cursor.execute('CREATE INDEX IF NOT EXISTS idx_category ON errors(category)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_resolved ON errors(resolved_at)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_error ON events(error_key)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_sent_at ON notification_history(sent_at)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)') - cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)') conn.commit() conn.close() @@ -492,58 +468,32 @@ class HealthPersistence: cursor = conn.cursor() now = datetime.now() - now_iso = now.isoformat() # Delete resolved errors older than 7 days cutoff_resolved = (now - timedelta(days=7)).isoformat() cursor.execute('DELETE FROM errors WHERE resolved_at < ?', (cutoff_resolved,)) - # ── Auto-resolve stale errors using Suppression Duration settings ── - # Read per-category suppression hours from user_settings. - # If the user hasn't configured a value, use DEFAULT_SUPPRESSION_HOURS. - # This is the SINGLE source of truth for auto-resolution timing. - user_settings = {} - try: - cursor.execute( - 'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?', - ('suppress_%',) - ) - for row in cursor.fetchall(): - user_settings[row[0]] = row[1] - except Exception: - pass - - for category, setting_key in self.CATEGORY_SETTING_MAP.items(): - stored = user_settings.get(setting_key) - try: - hours = int(stored) if stored else self.DEFAULT_SUPPRESSION_HOURS - except (ValueError, TypeError): - hours = self.DEFAULT_SUPPRESSION_HOURS - - # -1 means permanently suppressed -- skip auto-resolve - if hours < 0: - continue - - cutoff = (now - timedelta(hours=hours)).isoformat() - cursor.execute(''' - UPDATE errors - SET resolved_at = ? - WHERE category = ? - AND resolved_at IS NULL - AND last_seen < ? - AND acknowledged = 0 - ''', (now_iso, category, cutoff)) - - # Catch-all: auto-resolve any error from an unmapped category - # whose last_seen exceeds DEFAULT_SUPPRESSION_HOURS. - fallback_cutoff = (now - timedelta(hours=self.DEFAULT_SUPPRESSION_HOURS)).isoformat() + # Auto-resolve VM/CT errors older than 48h + cutoff_vm = (now - timedelta(seconds=self.VM_ERROR_RETENTION)).isoformat() cursor.execute(''' - UPDATE errors + UPDATE errors SET resolved_at = ? - WHERE resolved_at IS NULL + WHERE category = 'vms' + AND resolved_at IS NULL + AND first_seen < ? AND acknowledged = 0 - AND last_seen < ? - ''', (now_iso, fallback_cutoff)) + ''', (now.isoformat(), cutoff_vm)) + + # Auto-resolve log errors older than 24h + cutoff_logs = (now - timedelta(seconds=self.LOG_ERROR_RETENTION)).isoformat() + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE category = 'logs' + AND resolved_at IS NULL + AND first_seen < ? + AND acknowledged = 0 + ''', (now.isoformat(), cutoff_logs)) # Delete old events (>30 days) cutoff_events = (now - timedelta(days=30)).isoformat() diff --git a/AppImage/scripts/notification_channels.py b/AppImage/scripts/notification_channels.py deleted file mode 100644 index 9cb6255f..00000000 --- a/AppImage/scripts/notification_channels.py +++ /dev/null @@ -1,579 +0,0 @@ -""" -ProxMenux Notification Channels -Provides transport adapters for Telegram, Gotify, and Discord. - -Each channel implements send() and test() with: -- Retry with exponential backoff (3 attempts) -- Request timeout of 10s -- Rate limiting (max 30 msg/min per channel) - -Author: MacRimi -""" - -import json -import time -import urllib.request -import urllib.error -import urllib.parse -from abc import ABC, abstractmethod -from collections import deque -from typing import Tuple, Optional, Dict, Any - - -# ─── Rate Limiter ──────────────────────────────────────────────── - -class RateLimiter: - """Token-bucket rate limiter: max N messages per window.""" - - def __init__(self, max_calls: int = 30, window_seconds: int = 60): - self.max_calls = max_calls - self.window = window_seconds - self._timestamps: deque = deque() - - def allow(self) -> bool: - now = time.monotonic() - while self._timestamps and now - self._timestamps[0] > self.window: - self._timestamps.popleft() - if len(self._timestamps) >= self.max_calls: - return False - self._timestamps.append(now) - return True - - def wait_time(self) -> float: - if not self._timestamps: - return 0.0 - return max(0.0, self.window - (time.monotonic() - self._timestamps[0])) - - -# ─── Base Channel ──────────────────────────────────────────────── - -class NotificationChannel(ABC): - """Abstract base for all notification channels.""" - - MAX_RETRIES = 3 - RETRY_DELAYS = [2, 4, 8] # exponential backoff seconds - REQUEST_TIMEOUT = 10 - - def __init__(self): - self._rate_limiter = RateLimiter(max_calls=30, window_seconds=60) - - @abstractmethod - def send(self, title: str, message: str, severity: str = 'INFO', - data: Optional[Dict] = None) -> Dict[str, Any]: - """Send a notification. Returns {success, error, channel}.""" - pass - - @abstractmethod - def test(self) -> Tuple[bool, str]: - """Send a test message. Returns (success, error_message).""" - pass - - @abstractmethod - def validate_config(self) -> Tuple[bool, str]: - """Check if config is valid without sending. Returns (valid, error).""" - pass - - def _http_request(self, url: str, data: bytes, headers: Dict[str, str], - method: str = 'POST') -> Tuple[int, str]: - """Execute HTTP request with timeout. Returns (status_code, body).""" - req = urllib.request.Request(url, data=data, headers=headers, method=method) - try: - with urllib.request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as resp: - body = resp.read().decode('utf-8', errors='replace') - return resp.status, body - except urllib.error.HTTPError as e: - body = e.read().decode('utf-8', errors='replace') if e.fp else str(e) - return e.code, body - except urllib.error.URLError as e: - return 0, str(e.reason) - except Exception as e: - return 0, str(e) - - def _send_with_retry(self, send_fn) -> Dict[str, Any]: - """Wrap a send function with rate limiting and retry logic.""" - if not self._rate_limiter.allow(): - wait = self._rate_limiter.wait_time() - return { - 'success': False, - 'error': f'Rate limited. Retry in {wait:.0f}s', - 'rate_limited': True - } - - last_error = '' - for attempt in range(self.MAX_RETRIES): - try: - status, body = send_fn() - if 200 <= status < 300: - return {'success': True, 'error': None} - last_error = f'HTTP {status}: {body[:200]}' - except Exception as e: - last_error = str(e) - - if attempt < self.MAX_RETRIES - 1: - time.sleep(self.RETRY_DELAYS[attempt]) - - return {'success': False, 'error': last_error} - - -# ─── Telegram ──────────────────────────────────────────────────── - -class TelegramChannel(NotificationChannel): - """Telegram Bot API channel using HTML parse mode.""" - - API_BASE = 'https://api.telegram.org/bot{token}/sendMessage' - MAX_LENGTH = 4096 - - SEVERITY_ICONS = { - 'CRITICAL': '\U0001F534', # red circle - 'WARNING': '\U0001F7E1', # yellow circle - 'INFO': '\U0001F535', # blue circle - 'OK': '\U0001F7E2', # green circle - 'UNKNOWN': '\u26AA', # white circle - } - - def __init__(self, bot_token: str, chat_id: str): - super().__init__() - token = bot_token.strip() - # Strip 'bot' prefix if user included it (API_BASE already adds it) - if token.lower().startswith('bot') and ':' in token[3:]: - token = token[3:] - self.bot_token = token - self.chat_id = chat_id.strip() - - def validate_config(self) -> Tuple[bool, str]: - if not self.bot_token: - return False, 'Bot token is required' - if not self.chat_id: - return False, 'Chat ID is required' - if ':' not in self.bot_token: - return False, 'Invalid bot token format (expected BOT_ID:TOKEN)' - return True, '' - - def send(self, title: str, message: str, severity: str = 'INFO', - data: Optional[Dict] = None) -> Dict[str, Any]: - icon = self.SEVERITY_ICONS.get(severity, self.SEVERITY_ICONS['INFO']) - html_msg = f"{icon} {self._escape_html(title)}\n\n{self._escape_html(message)}" - - # Split long messages - chunks = self._split_message(html_msg) - result = {'success': True, 'error': None, 'channel': 'telegram'} - - for chunk in chunks: - res = self._send_with_retry(lambda c=chunk: self._post_message(c)) - if not res['success']: - result = {**res, 'channel': 'telegram'} - break - - return result - - def test(self) -> Tuple[bool, str]: - valid, err = self.validate_config() - if not valid: - return False, err - - result = self.send( - 'ProxMenux Test', - 'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.', - 'INFO' - ) - return result['success'], result.get('error', '') - - def _post_message(self, text: str) -> Tuple[int, str]: - url = self.API_BASE.format(token=self.bot_token) - payload = json.dumps({ - 'chat_id': self.chat_id, - 'text': text, - 'parse_mode': 'HTML', - 'disable_web_page_preview': True, - }).encode('utf-8') - - return self._http_request(url, payload, {'Content-Type': 'application/json'}) - - def _split_message(self, text: str) -> list: - if len(text) <= self.MAX_LENGTH: - return [text] - chunks = [] - while text: - if len(text) <= self.MAX_LENGTH: - chunks.append(text) - break - split_at = text.rfind('\n', 0, self.MAX_LENGTH) - if split_at == -1: - split_at = self.MAX_LENGTH - chunks.append(text[:split_at]) - text = text[split_at:].lstrip('\n') - return chunks - - @staticmethod - def _escape_html(text: str) -> str: - return (text - .replace('&', '&') - .replace('<', '<') - .replace('>', '>')) - - -# ─── Gotify ────────────────────────────────────────────────────── - -class GotifyChannel(NotificationChannel): - """Gotify push notification channel with priority mapping.""" - - PRIORITY_MAP = { - 'OK': 1, - 'INFO': 2, - 'UNKNOWN': 3, - 'WARNING': 5, - 'CRITICAL': 10, - } - - def __init__(self, server_url: str, app_token: str): - super().__init__() - self.server_url = server_url.rstrip('/').strip() - self.app_token = app_token.strip() - - def validate_config(self) -> Tuple[bool, str]: - if not self.server_url: - return False, 'Server URL is required' - if not self.app_token: - return False, 'Application token is required' - if not self.server_url.startswith(('http://', 'https://')): - return False, 'Server URL must start with http:// or https://' - return True, '' - - def send(self, title: str, message: str, severity: str = 'INFO', - data: Optional[Dict] = None) -> Dict[str, Any]: - priority = self.PRIORITY_MAP.get(severity, 2) - - result = self._send_with_retry( - lambda: self._post_message(title, message, priority) - ) - result['channel'] = 'gotify' - return result - - def test(self) -> Tuple[bool, str]: - valid, err = self.validate_config() - if not valid: - return False, err - - result = self.send( - 'ProxMenux Test', - 'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.', - 'INFO' - ) - return result['success'], result.get('error', '') - - def _post_message(self, title: str, message: str, priority: int) -> Tuple[int, str]: - url = f"{self.server_url}/message?token={self.app_token}" - payload = json.dumps({ - 'title': title, - 'message': message, - 'priority': priority, - 'extras': { - 'client::display': {'contentType': 'text/markdown'} - } - }).encode('utf-8') - - return self._http_request(url, payload, {'Content-Type': 'application/json'}) - - -# ─── Discord ───────────────────────────────────────────────────── - -class DiscordChannel(NotificationChannel): - """Discord webhook channel with color-coded embeds.""" - - MAX_EMBED_DESC = 2048 - - SEVERITY_COLORS = { - 'CRITICAL': 0xED4245, # red - 'WARNING': 0xFEE75C, # yellow - 'INFO': 0x5865F2, # blurple - 'OK': 0x57F287, # green - 'UNKNOWN': 0x99AAB5, # grey - } - - def __init__(self, webhook_url: str): - super().__init__() - self.webhook_url = webhook_url.strip() - - def validate_config(self) -> Tuple[bool, str]: - if not self.webhook_url: - return False, 'Webhook URL is required' - if 'discord.com/api/webhooks/' not in self.webhook_url: - return False, 'Invalid Discord webhook URL' - return True, '' - - def send(self, title: str, message: str, severity: str = 'INFO', - data: Optional[Dict] = None) -> Dict[str, Any]: - color = self.SEVERITY_COLORS.get(severity, 0x5865F2) - - desc = message[:self.MAX_EMBED_DESC] if len(message) > self.MAX_EMBED_DESC else message - - embed = { - 'title': title, - 'description': desc, - 'color': color, - 'footer': {'text': 'ProxMenux Monitor'}, - 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()), - } - - # Use structured fields from render_template if available - rendered_fields = (data or {}).get('_rendered_fields', []) - if rendered_fields: - embed['fields'] = [ - {'name': name, 'value': val[:1024], 'inline': True} - for name, val in rendered_fields[:25] # Discord limit: 25 fields - ] - elif data: - fields = [] - if data.get('category'): - fields.append({'name': 'Category', 'value': data['category'], 'inline': True}) - if data.get('hostname'): - fields.append({'name': 'Host', 'value': data['hostname'], 'inline': True}) - if data.get('severity'): - fields.append({'name': 'Severity', 'value': data['severity'], 'inline': True}) - if fields: - embed['fields'] = fields - - result = self._send_with_retry( - lambda: self._post_webhook(embed) - ) - result['channel'] = 'discord' - return result - - def test(self) -> Tuple[bool, str]: - valid, err = self.validate_config() - if not valid: - return False, err - - result = self.send( - 'ProxMenux Test', - 'Notification service is working correctly.\nThis is a test message from ProxMenux Monitor.', - 'INFO' - ) - return result['success'], result.get('error', '') - - def _post_webhook(self, embed: Dict) -> Tuple[int, str]: - payload = json.dumps({ - 'username': 'ProxMenux', - 'embeds': [embed] - }).encode('utf-8') - - return self._http_request( - self.webhook_url, payload, {'Content-Type': 'application/json'} - ) - - -# ─── Email Channel ────────────────────────────────────────────── - -class EmailChannel(NotificationChannel): - """Email notification channel using SMTP (smtplib) or sendmail fallback. - - Config keys: - host, port, username, password, tls_mode (none|starttls|ssl), - from_address, to_addresses (comma-separated), subject_prefix, timeout - """ - - def __init__(self, config: Dict[str, str]): - super().__init__() - self.host = config.get('host', '') - self.port = int(config.get('port', 587) or 587) - self.username = config.get('username', '') - self.password = config.get('password', '') - self.tls_mode = config.get('tls_mode', 'starttls') # none | starttls | ssl - self.from_address = config.get('from_address', '') - self.to_addresses = self._parse_recipients(config.get('to_addresses', '')) - self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') - self.timeout = int(config.get('timeout', 10) or 10) - - @staticmethod - def _parse_recipients(raw) -> list: - if isinstance(raw, list): - return [a.strip() for a in raw if a.strip()] - return [addr.strip() for addr in str(raw).split(',') if addr.strip()] - - def validate_config(self) -> Tuple[bool, str]: - if not self.to_addresses: - return False, 'No recipients configured' - if not self.from_address: - return False, 'No from address configured' - # Must have SMTP host OR local sendmail available - if not self.host: - import os - if not os.path.exists('/usr/sbin/sendmail'): - return False, 'No SMTP host configured and /usr/sbin/sendmail not found' - return True, '' - - def send(self, title: str, message: str, severity: str = 'INFO', - data: Optional[Dict] = None) -> Dict[str, Any]: - subject = f"{self.subject_prefix} [{severity}] {title}" - - def _do_send(): - if self.host: - return self._send_smtp(subject, message, severity) - else: - return self._send_sendmail(subject, message, severity) - - return self._send_with_retry(_do_send) - - def _send_smtp(self, subject: str, body: str, severity: str) -> Tuple[int, str]: - import smtplib - from email.message import EmailMessage - - msg = EmailMessage() - msg['Subject'] = subject - msg['From'] = self.from_address - msg['To'] = ', '.join(self.to_addresses) - msg.set_content(body) - - # Add HTML alternative - html_body = self._format_html(subject, body, severity) - if html_body: - msg.add_alternative(html_body, subtype='html') - - try: - if self.tls_mode == 'ssl': - server = smtplib.SMTP_SSL(self.host, self.port, timeout=self.timeout) - else: - server = smtplib.SMTP(self.host, self.port, timeout=self.timeout) - if self.tls_mode == 'starttls': - server.starttls() - - if self.username and self.password: - server.login(self.username, self.password) - - server.send_message(msg) - server.quit() - return 200, 'OK' - except smtplib.SMTPAuthenticationError as e: - return 0, f'SMTP authentication failed: {e}' - except smtplib.SMTPConnectError as e: - return 0, f'SMTP connection failed: {e}' - except smtplib.SMTPException as e: - return 0, f'SMTP error: {e}' - except (OSError, TimeoutError) as e: - return 0, f'Connection error: {e}' - - def _send_sendmail(self, subject: str, body: str, severity: str) -> Tuple[int, str]: - import os - import subprocess - from email.message import EmailMessage - - sendmail = '/usr/sbin/sendmail' - if not os.path.exists(sendmail): - return 0, 'sendmail not found at /usr/sbin/sendmail' - - msg = EmailMessage() - msg['Subject'] = subject - msg['From'] = self.from_address or 'proxmenux@localhost' - msg['To'] = ', '.join(self.to_addresses) - msg.set_content(body) - - try: - proc = subprocess.run( - [sendmail, '-t', '-oi'], - input=msg.as_string(), capture_output=True, text=True, timeout=30 - ) - if proc.returncode == 0: - return 200, 'OK' - return 0, f'sendmail failed (rc={proc.returncode}): {proc.stderr[:200]}' - except subprocess.TimeoutExpired: - return 0, 'sendmail timed out after 30s' - except Exception as e: - return 0, f'sendmail error: {e}' - - @staticmethod - def _format_html(subject: str, body: str, severity: str) -> str: - """Create professional HTML email.""" - import html as html_mod - - severity_colors = {'CRITICAL': '#dc2626', 'WARNING': '#f59e0b', 'INFO': '#3b82f6'} - color = severity_colors.get(severity, '#6b7280') - - body_html = ''.join( - f'

{html_mod.escape(line)}

' - for line in body.split('\n') if line.strip() - ) - - return f''' - -
-
-

ProxMenux Monitor

-

{html_mod.escape(severity)} Alert

-
-
-

{html_mod.escape(subject)}

- {body_html} -
-
-

Sent by ProxMenux Notification Service

-
-
-''' - - def test(self) -> Tuple[bool, str]: - result = self.send( - 'ProxMenux Test Notification', - 'This is a test notification from ProxMenux Monitor.\n' - 'If you received this, your email channel is working correctly.', - 'INFO' - ) - return result.get('success', False), result.get('error', '') - - -# ─── Channel Factory ───────────────────────────────────────────── - -CHANNEL_TYPES = { - 'telegram': { - 'name': 'Telegram', - 'config_keys': ['bot_token', 'chat_id'], - 'class': TelegramChannel, - }, - 'gotify': { - 'name': 'Gotify', - 'config_keys': ['url', 'token'], - 'class': GotifyChannel, - }, - 'discord': { - 'name': 'Discord', - 'config_keys': ['webhook_url'], - 'class': DiscordChannel, - }, - 'email': { - 'name': 'Email (SMTP)', - 'config_keys': ['host', 'port', 'username', 'password', 'tls_mode', - 'from_address', 'to_addresses', 'subject_prefix'], - 'class': EmailChannel, - }, -} - - -def create_channel(channel_type: str, config: Dict[str, str]) -> Optional[NotificationChannel]: - """Create a channel instance from type name and config dict. - - Args: - channel_type: 'telegram', 'gotify', or 'discord' - config: Dict with channel-specific keys (see CHANNEL_TYPES) - - Returns: - Channel instance or None if creation fails - """ - try: - if channel_type == 'telegram': - return TelegramChannel( - bot_token=config.get('bot_token', ''), - chat_id=config.get('chat_id', '') - ) - elif channel_type == 'gotify': - return GotifyChannel( - server_url=config.get('url', ''), - app_token=config.get('token', '') - ) - elif channel_type == 'discord': - return DiscordChannel( - webhook_url=config.get('webhook_url', '') - ) - elif channel_type == 'email': - return EmailChannel(config) - except Exception as e: - print(f"[NotificationChannels] Failed to create {channel_type}: {e}") - return None diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py deleted file mode 100644 index 8a47d428..00000000 --- a/AppImage/scripts/notification_events.py +++ /dev/null @@ -1,1301 +0,0 @@ -""" -ProxMenux Notification Event Watchers -Detects Proxmox events from journald, PVE task log, and health monitor. - -Architecture: -- JournalWatcher: Real-time stream of journald for critical events -- TaskWatcher: Real-time tail of /var/log/pve/tasks/index for VM/CT/backup events -- PollingCollector: Periodic poll of health_persistence pending notifications - -All watchers put events into a shared Queue consumed by NotificationManager. - -Author: MacRimi -""" - -import os -import re -import json -import time -import hashlib -import socket -import sqlite3 -import subprocess -import threading -from queue import Queue -from typing import Optional, Dict, Any, Tuple -from pathlib import Path - - -# ─── Event Object ───────────────────────────────────────────────── - -class NotificationEvent: - """Represents a detected event ready for notification dispatch. - - Fields: - event_type: Taxonomy key (e.g. 'vm_fail', 'auth_fail', 'split_brain') - severity: INFO | WARNING | CRITICAL - data: Payload dict with context (hostname, vmid, reason, etc.) - source: Origin: journal | tasks | health | proxmox_hook | cli | api | polling - entity: What is affected: node | vm | ct | storage | disk | network | cluster | user - entity_id: Specific identifier (vmid, IP, device, pool, interface, etc.) - raw: Original payload (webhook JSON or log line), optional - fingerprint: Stable dedup key: hostname:entity:entity_id:event_type - event_id: Short hash of fingerprint for correlation - ts_epoch: time.time() at creation - ts_monotonic: time.monotonic() at creation (drift-safe for cooldown) - """ - - __slots__ = ( - 'event_type', 'severity', 'data', 'timestamp', 'source', - 'entity', 'entity_id', 'raw', - 'fingerprint', 'event_id', 'ts_epoch', 'ts_monotonic', - ) - - def __init__(self, event_type: str, severity: str = 'INFO', - data: Optional[Dict[str, Any]] = None, - source: str = 'watcher', - entity: str = 'node', entity_id: str = '', - raw: Any = None): - self.event_type = event_type - self.severity = severity - self.data = data or {} - self.source = source - self.entity = entity - self.entity_id = entity_id - self.raw = raw - self.ts_epoch = time.time() - self.ts_monotonic = time.monotonic() - self.timestamp = self.ts_epoch # backward compat - - # Build fingerprint for dedup/cooldown - hostname = self.data.get('hostname', _hostname()) - if entity_id: - fp_base = f"{hostname}:{entity}:{entity_id}:{event_type}" - else: - # When entity_id is empty, include a hash of title/body for uniqueness - reason = self.data.get('reason', self.data.get('title', '')) - stable_extra = hashlib.md5(reason.encode(errors='replace')).hexdigest()[:8] if reason else '' - fp_base = f"{hostname}:{entity}:{event_type}:{stable_extra}" - self.fingerprint = fp_base - self.event_id = hashlib.md5(fp_base.encode()).hexdigest()[:12] - - def __repr__(self): - return f"NotificationEvent({self.event_type}, {self.severity}, fp={self.fingerprint[:40]})" - - -def _hostname() -> str: - try: - return socket.gethostname().split('.')[0] - except Exception: - return 'proxmox' - - -# ─── Journal Watcher (Real-time) ───────────────────────────────── - -class JournalWatcher: - """Watches journald in real-time for critical system events. - - Uses 'journalctl -f -o json' subprocess to stream entries. - Detects: auth failures, kernel panics, OOM, service crashes, - disk I/O errors, split-brain, node disconnect, system shutdown, - fail2ban bans, firewall blocks, permission changes. - """ - - def __init__(self, event_queue: Queue): - self._queue = event_queue - self._running = False - self._thread: Optional[threading.Thread] = None - self._process: Optional[subprocess.Popen] = None - self._hostname = _hostname() - - # Dedup: track recent events to avoid duplicates - self._recent_events: Dict[str, float] = {} - self._dedup_window = 30 # seconds - - def start(self): - """Start the journal watcher thread.""" - if self._running: - return - self._running = True - self._thread = threading.Thread(target=self._watch_loop, daemon=True, - name='journal-watcher') - self._thread.start() - - def stop(self): - """Stop the journal watcher.""" - self._running = False - if self._process: - try: - self._process.terminate() - self._process.wait(timeout=5) - except Exception: - try: - self._process.kill() - except Exception: - pass - - def _watch_loop(self): - """Main watch loop with auto-restart on failure.""" - while self._running: - try: - self._run_journalctl() - except Exception as e: - print(f"[JournalWatcher] Error: {e}") - if self._running: - time.sleep(5) # Wait before restart - - def _run_journalctl(self): - """Run journalctl -f and process output line by line.""" - cmd = ['journalctl', '-f', '-o', 'json', '--no-pager', - '-n', '0'] # Start from now, don't replay history - - self._process = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, - text=True, bufsize=1 - ) - - for line in self._process.stdout: - if not self._running: - break - line = line.strip() - if not line: - continue - try: - entry = json.loads(line) - self._process_entry(entry) - except (json.JSONDecodeError, KeyError): - # Try plain text matching as fallback - self._process_plain(line) - - if self._process: - self._process.wait() - - def _process_entry(self, entry: Dict): - """Process a parsed journald JSON entry.""" - msg = entry.get('MESSAGE', '') - if not msg or not isinstance(msg, str): - return - - unit = entry.get('_SYSTEMD_UNIT', '') - syslog_id = entry.get('SYSLOG_IDENTIFIER', '') - priority = int(entry.get('PRIORITY', 6)) - - self._check_auth_failure(msg, syslog_id, entry) - self._check_fail2ban(msg, syslog_id) - self._check_kernel_critical(msg, syslog_id, priority) - self._check_service_failure(msg, unit) - self._check_disk_io(msg, syslog_id, priority) - self._check_cluster_events(msg, syslog_id) - self._check_system_shutdown(msg, syslog_id) - self._check_permission_change(msg, syslog_id) - self._check_firewall(msg, syslog_id) - - def _process_plain(self, line: str): - """Fallback: process a plain text log line.""" - self._check_auth_failure(line, '', {}) - self._check_fail2ban(line, '') - self._check_kernel_critical(line, '', 6) - self._check_cluster_events(line, '') - self._check_system_shutdown(line, '') - - # ── Detection methods ── - - def _check_auth_failure(self, msg: str, syslog_id: str, entry: Dict): - """Detect authentication failures (SSH, PAM, PVE).""" - patterns = [ - (r'Failed password for (?:invalid user )?(\S+) from (\S+)', 'ssh'), - (r'authentication failure.*rhost=(\S+).*user=(\S+)', 'pam'), - (r'pvedaemon\[.*authentication failure.*rhost=(\S+)', 'pve'), - ] - - for pattern, service in patterns: - match = re.search(pattern, msg, re.IGNORECASE) - if match: - groups = match.groups() - if service == 'ssh': - username, source_ip = groups[0], groups[1] - elif service == 'pam': - source_ip, username = groups[0], groups[1] - else: - source_ip = groups[0] - username = 'unknown' - - self._emit('auth_fail', 'WARNING', { - 'source_ip': source_ip, - 'username': username, - 'service': service, - 'hostname': self._hostname, - }, entity='user', entity_id=source_ip) - return - - def _check_fail2ban(self, msg: str, syslog_id: str): - """Detect Fail2Ban IP bans.""" - if 'fail2ban' not in msg.lower() and syslog_id != 'fail2ban-server': - return - - # Ban detected - ban_match = re.search(r'Ban\s+(\S+)', msg) - if ban_match: - ip = ban_match.group(1) - jail_match = re.search(r'\[(\w+)\]', msg) - jail = jail_match.group(1) if jail_match else 'unknown' - - self._emit('ip_block', 'INFO', { - 'source_ip': ip, - 'jail': jail, - 'failures': '', - 'hostname': self._hostname, - }, entity='user', entity_id=ip) - - def _check_kernel_critical(self, msg: str, syslog_id: str, priority: int): - """Detect kernel panics, OOM, segfaults, hardware errors.""" - # Only process messages from kernel or systemd (not app-level logs) - if syslog_id and syslog_id not in ('kernel', 'systemd', 'systemd-coredump', ''): - return - - # Filter out normal kernel messages that are NOT problems - _KERNEL_NOISE = [ - r'vfio-pci\s+\S+:\s*reset', # PCI passthrough resets (normal during VM start/stop) - r'vfio-pci\s+\S+:\s*resetting', - r'entered\s+(?:promiscuous|allmulticast)\s+mode', # Network bridge ops - r'entered\s+(?:blocking|forwarding|disabled)\s+state', # Bridge STP - r'tap\d+i\d+:', # TAP interface events - r'vmbr\d+:.*port\s+\d+', # Bridge port events - ] - for noise in _KERNEL_NOISE: - if re.search(noise, msg, re.IGNORECASE): - return - - # NOTE: Disk I/O errors (ATA, SCSI, blk_update_request) are NOT handled - # here. They are detected exclusively by HealthMonitor._check_disks_optimized - # which records to health_persistence -> PollingCollector -> notification. - # This avoids duplicate notifications and ensures the health dashboard - # stays in sync with notifications. - # Filesystem errors (EXT4/BTRFS/XFS/ZFS) ARE handled here because they - # indicate corruption, not just hardware I/O problems. - - critical_patterns = { - r'kernel panic': ('system_problem', 'CRITICAL', 'Kernel panic'), - r'Out of memory': ('system_problem', 'CRITICAL', 'Out of memory killer activated'), - r'segfault': ('system_problem', 'WARNING', 'Segmentation fault detected'), - r'BUG:': ('system_problem', 'CRITICAL', 'Kernel BUG detected'), - r'Call Trace:': ('system_problem', 'WARNING', 'Kernel call trace'), - r'EXT4-fs error': ('system_problem', 'CRITICAL', 'Filesystem error'), - r'BTRFS error': ('system_problem', 'CRITICAL', 'Filesystem error'), - r'XFS.*error': ('system_problem', 'CRITICAL', 'Filesystem error'), - r'ZFS.*error': ('system_problem', 'CRITICAL', 'ZFS pool error'), - r'mce:.*Hardware Error': ('system_problem', 'CRITICAL', 'Hardware error (MCE)'), - } - - for pattern, (event_type, severity, reason) in critical_patterns.items(): - if re.search(pattern, msg, re.IGNORECASE): - entity = 'node' - entity_id = '' - - # Build a context-rich reason from the journal message. - enriched = reason - - if 'segfault' in pattern: - m = re.search(r'(\S+)\[(\d+)\].*segfault', msg) - proc_name = m.group(1) if m else '' - proc_pid = m.group(2) if m else '' - lib_match = re.search(r'\bin\s+(\S+)', msg) - lib_name = lib_match.group(1) if lib_match else '' - - parts = [reason] - if proc_name: - parts.append(f"Process: {proc_name}" + (f" (PID {proc_pid})" if proc_pid else '')) - if lib_name: - parts.append(f"Module: {lib_name}") - enriched = '\n'.join(parts) - - elif 'Out of memory' in pattern: - m = re.search(r'Killed process\s+(\d+)\s+\(([^)]+)\)', msg) - if m: - enriched = f"{reason}\nKilled: {m.group(2)} (PID {m.group(1)})" - else: - enriched = f"{reason}\n{msg[:300]}" - - else: - # Generic: include the raw journal message for context - enriched = f"{reason}\n{msg[:300]}" - - data = {'reason': enriched, 'hostname': self._hostname} - - self._emit(event_type, severity, data, entity=entity, entity_id=entity_id) - return - - def _check_service_failure(self, msg: str, unit: str): - """Detect critical service failures with enriched context.""" - # Filter out noise -- these are normal systemd transient units, - # not real service failures worth alerting about. - _NOISE_PATTERNS = [ - r'session-\d+\.scope', # SSH/login sessions - r'user@\d+\.service', # Per-user service managers - r'user-runtime-dir@\d+', # User runtime dirs - r'systemd-coredump@', # Coredump handlers (transient) - r'run-.*\.mount', # Transient mounts - ] - for noise in _NOISE_PATTERNS: - if re.search(noise, msg) or re.search(noise, unit): - return - - service_patterns = [ - r'Failed to start (.+)', - r'Unit (\S+) (?:entered failed state|failed)', - r'(\S+)\.service: (?:Main process exited|Failed with result)', - ] - - for pattern in service_patterns: - match = re.search(pattern, msg) - if match: - service_name = match.group(1) - data = { - 'service_name': service_name, - 'reason': msg[:300], - 'hostname': self._hostname, - } - - # Enrich PVE VM/CT services with guest name and context - # pve-container@101 -> LXC container 101 - # qemu-server@100 -> QEMU VM 100 - pve_match = re.match( - r'(pve-container|qemu-server)@(\d+)', service_name) - if pve_match: - svc_type = pve_match.group(1) - vmid = pve_match.group(2) - vm_name = self._resolve_vm_name(vmid) - - if svc_type == 'pve-container': - guest_type = 'LXC container' - else: - guest_type = 'QEMU VM' - - display = f"{guest_type} {vmid}" - if vm_name: - display = f"{guest_type} {vmid} ({vm_name})" - - data['service_name'] = service_name - data['vmid'] = vmid - data['vmname'] = vm_name - data['guest_type'] = guest_type - data['display_name'] = display - data['reason'] = ( - f"{display} failed to start.\n{msg[:300]}" - ) - - self._emit('service_fail', 'WARNING', data, - entity='node', entity_id=service_name) - return - - def _resolve_vm_name(self, vmid: str) -> str: - """Try to resolve VMID to a guest name from PVE config files.""" - if not vmid: - return '' - # Check QEMU configs - for base in ['/etc/pve/qemu-server', '/etc/pve/lxc']: - conf = os.path.join(base, f'{vmid}.conf') - try: - with open(conf) as f: - for line in f: - if line.startswith('hostname:') or line.startswith('name:'): - return line.split(':', 1)[1].strip() - except (OSError, IOError): - continue - return '' - - def _check_disk_io(self, msg: str, syslog_id: str, priority: int): - """Detect disk I/O errors from kernel messages.""" - if syslog_id != 'kernel' and priority > 3: - return - - io_patterns = [ - r'blk_update_request: I/O error.*dev (\S+)', - r'Buffer I/O error on device (\S+)', - r'SCSI error.*sd(\w)', - r'ata\d+.*error', - ] - - for pattern in io_patterns: - match = re.search(pattern, msg) - if match: - device = match.group(1) if match.lastindex else 'unknown' - self._emit('disk_io_error', 'CRITICAL', { - 'device': device, - 'reason': msg[:200], - 'hostname': self._hostname, - }, entity='disk', entity_id=device) - return - - def _check_cluster_events(self, msg: str, syslog_id: str): - """Detect cluster split-brain and node disconnect.""" - msg_lower = msg.lower() - - # Split-brain - if any(p in msg_lower for p in ['split-brain', 'split brain', - 'fencing required', 'cluster partition']): - quorum = 'unknown' - if 'quorum' in msg_lower: - quorum = 'lost' if 'lost' in msg_lower else 'valid' - - self._emit('split_brain', 'CRITICAL', { - 'quorum': quorum, - 'reason': msg[:200], - 'hostname': self._hostname, - }, entity='cluster', entity_id=self._hostname) - return - - # Node disconnect - if (('quorum' in msg_lower and 'lost' in msg_lower) or - ('node' in msg_lower and any(w in msg_lower for w in ['left', 'offline', 'lost']))): - - node_match = re.search(r'[Nn]ode\s+(\S+)', msg) - node_name = node_match.group(1) if node_match else 'unknown' - - self._emit('node_disconnect', 'CRITICAL', { - 'node_name': node_name, - 'hostname': self._hostname, - }, entity='cluster', entity_id=node_name) - - def _check_system_shutdown(self, msg: str, syslog_id: str): - """Detect system shutdown/reboot. - - Matches multiple systemd signals that indicate the node is going down: - - "Shutting down." (systemd PID 1) - - "System is powering off." / "System is rebooting." - - "Reached target Shutdown." / "Reached target Reboot." - - "Journal stopped" (very late in shutdown) - - "The system will reboot now!" / "The system will power off now!" - """ - msg_lower = msg.lower() - - # Only process systemd / logind messages - if not any(s in syslog_id for s in ('systemd', 'logind', '')): - if 'systemd' not in msg_lower: - return - - is_reboot = False - is_shutdown = False - - # Detect reboot signals - reboot_signals = [ - 'system is rebooting', - 'reached target reboot', - 'the system will reboot now', - 'starting reboot', - ] - for sig in reboot_signals: - if sig in msg_lower: - is_reboot = True - break - - # Detect shutdown/poweroff signals - if not is_reboot: - shutdown_signals = [ - 'system is powering off', - 'system is halting', - 'shutting down', - 'reached target shutdown', - 'reached target halt', - 'the system will power off now', - 'starting power-off', - 'journal stopped', - 'stopping journal service', - ] - for sig in shutdown_signals: - if sig in msg_lower: - is_shutdown = True - break - - if is_reboot: - self._emit('system_reboot', 'CRITICAL', { - 'reason': msg[:200], - 'hostname': self._hostname, - }, entity='node', entity_id='') - elif is_shutdown: - self._emit('system_shutdown', 'CRITICAL', { - 'reason': msg[:200], - 'hostname': self._hostname, - }, entity='node', entity_id='') - - def _check_permission_change(self, msg: str, syslog_id: str): - """Detect user permission changes in PVE.""" - permission_patterns = [ - (r'set permissions.*user\s+(\S+)', 'Permission changed'), - (r'user added to group.*?(\S+)', 'Added to group'), - (r'user removed from group.*?(\S+)', 'Removed from group'), - (r'ACL updated.*?(\S+)', 'ACL updated'), - (r'Role assigned.*?(\S+)', 'Role assigned'), - ] - - for pattern, action in permission_patterns: - match = re.search(pattern, msg, re.IGNORECASE) - if match: - username = match.group(1) - self._emit('user_permission_change', 'INFO', { - 'username': username, - 'change_details': action, - 'hostname': self._hostname, - }, entity='user', entity_id=username) - return - - def _check_firewall(self, msg: str, syslog_id: str): - """Detect firewall issues (not individual drops, but rule errors).""" - if re.search(r'pve-firewall.*(?:error|failed|unable)', msg, re.IGNORECASE): - self._emit('firewall_issue', 'WARNING', { - 'reason': msg[:200], - 'hostname': self._hostname, - }, entity='network', entity_id='') - - # ── Emit helper ── - - def _emit(self, event_type: str, severity: str, data: Dict, - entity: str = 'node', entity_id: str = ''): - """Emit event to queue with short-term deduplication (30s window).""" - event = NotificationEvent( - event_type, severity, data, source='journal', - entity=entity, entity_id=entity_id, - ) - - now = time.time() - last = self._recent_events.get(event.fingerprint, 0) - if now - last < self._dedup_window: - return # Skip duplicate within 30s window - - self._recent_events[event.fingerprint] = now - - # Cleanup old dedup entries periodically - if len(self._recent_events) > 200: - cutoff = now - self._dedup_window * 2 - self._recent_events = { - k: v for k, v in self._recent_events.items() if v > cutoff - } - - self._queue.put(event) - - -# ─── Task Watcher (Real-time) ──────────────────────────────────── - -class TaskWatcher: - """Watches /var/log/pve/tasks/index for VM/CT and backup events. - - The PVE task index file is appended when tasks start/finish. - Format: UPID:node:pid:pstart:starttime:type:id:user: - Final status is recorded when task completes. - """ - - TASK_LOG = '/var/log/pve/tasks/index' - - # Map PVE task types to our event types - TASK_MAP = { - 'qmstart': ('vm_start', 'INFO'), - 'qmstop': ('vm_stop', 'INFO'), - 'qmshutdown': ('vm_shutdown', 'INFO'), - 'qmreboot': ('vm_restart', 'INFO'), - 'qmreset': ('vm_restart', 'INFO'), - 'vzstart': ('ct_start', 'INFO'), - 'vzstop': ('ct_stop', 'INFO'), - 'vzshutdown': ('ct_shutdown', 'INFO'), - 'vzreboot': ('ct_restart', 'INFO'), - 'vzdump': ('backup_start', 'INFO'), - 'qmsnapshot': ('snapshot_complete', 'INFO'), - 'vzsnapshot': ('snapshot_complete', 'INFO'), - 'qmigrate': ('migration_start', 'INFO'), - 'vzmigrate': ('migration_start', 'INFO'), - } - - def __init__(self, event_queue: Queue): - self._queue = event_queue - self._running = False - self._thread: Optional[threading.Thread] = None - self._hostname = _hostname() - self._last_position = 0 - # Cache for active vzdump detection - self._vzdump_active_cache: float = 0 # timestamp of last positive check - self._vzdump_cache_ttl = 5 # cache result for 5s - - def start(self): - if self._running: - return - self._running = True - - # Start at end of file - if os.path.exists(self.TASK_LOG): - try: - self._last_position = os.path.getsize(self.TASK_LOG) - except OSError: - self._last_position = 0 - - self._thread = threading.Thread(target=self._watch_loop, daemon=True, - name='task-watcher') - self._thread.start() - - def stop(self): - self._running = False - - def _is_vzdump_active(self) -> bool: - """Check if a vzdump (backup) job is currently running. - - Reads /var/log/pve/tasks/active which lists all running PVE tasks. - Also verifies the process is actually alive (PID check). - Result is cached for a few seconds to avoid excessive file reads. - """ - now = time.time() - # Negative cache: if we recently confirmed NO vzdump, skip the check - if hasattr(self, '_vzdump_negative_cache') and \ - now - self._vzdump_negative_cache < self._vzdump_cache_ttl: - return False - # Positive cache - if now - self._vzdump_active_cache < self._vzdump_cache_ttl: - return True - - active_file = '/var/log/pve/tasks/active' - try: - with open(active_file, 'r') as f: - for line in f: - # UPID format: UPID:node:pid:pstart:starttime:type:id:user: - if ':vzdump:' in line: - # Verify the PID is still alive - parts = line.strip().split(':') - if len(parts) >= 3: - try: - pid = int(parts[2]) - os.kill(pid, 0) # Signal 0 = just check existence - self._vzdump_active_cache = now - return True - except (ValueError, ProcessLookupError, PermissionError): - pass # PID not found or not a number -- stale entry - except (OSError, IOError): - pass - - self._vzdump_negative_cache = now - return False - - def _watch_loop(self): - """Poll the task index file for new entries.""" - while self._running: - try: - if os.path.exists(self.TASK_LOG): - current_size = os.path.getsize(self.TASK_LOG) - - if current_size < self._last_position: - # File was truncated/rotated - self._last_position = 0 - - if current_size > self._last_position: - with open(self.TASK_LOG, 'r') as f: - f.seek(self._last_position) - new_lines = f.readlines() - self._last_position = f.tell() - - for line in new_lines: - self._process_task_line(line.strip()) - except Exception as e: - print(f"[TaskWatcher] Error reading task log: {e}") - - time.sleep(2) # Check every 2 seconds - - def _process_task_line(self, line: str): - """Process a single task index line. - - PVE task index format (space-separated): - UPID endtime status - Where UPID = UPID:node:pid:pstart:starttime:type:id:user: - """ - if not line: - return - - parts = line.split() - if not parts: - return - - upid = parts[0] - status = parts[2] if len(parts) >= 3 else '' - - # Parse UPID - upid_parts = upid.split(':') - if len(upid_parts) < 8: - return - - task_type = upid_parts[5] - vmid = upid_parts[6] - user = upid_parts[7] - - # Get VM/CT name - vmname = self._get_vm_name(vmid) if vmid else '' - - # Map to event type - event_info = self.TASK_MAP.get(task_type) - if not event_info: - return - - event_type, default_severity = event_info - - - - # Check if task failed - is_error = status and status != 'OK' and status != '' - - if is_error: - # Override to failure event - if 'start' in event_type: - event_type = event_type.replace('_start', '_fail') - elif 'complete' in event_type: - event_type = event_type.replace('_complete', '_fail') - severity = 'CRITICAL' - elif status == 'OK': - # Task completed successfully - if event_type == 'backup_start': - event_type = 'backup_complete' - elif event_type == 'migration_start': - event_type = 'migration_complete' - severity = 'INFO' - else: - # Task just started (no status yet) - severity = default_severity - - data = { - 'vmid': vmid, - 'vmname': vmname or f'ID {vmid}', - 'hostname': self._hostname, - 'user': user, - 'reason': status if is_error else '', - 'target_node': '', - 'size': '', - 'snapshot_name': '', - } - - # Determine entity type from task type - entity = 'ct' if task_type.startswith('vz') else 'vm' - - # Backup and replication events are handled EXCLUSIVELY by the PVE - # webhook, which delivers much richer data (full logs, sizes, durations, - # filenames). TaskWatcher skips these entirely to avoid duplicates. - _WEBHOOK_EXCLUSIVE = {'backup_complete', 'backup_fail', 'backup_start', - 'replication_complete', 'replication_fail'} - if event_type in _WEBHOOK_EXCLUSIVE: - return - - # Suppress VM/CT start/stop/shutdown while a vzdump is active. - # These are backup-induced operations (mode=stop), not user actions. - # Exception: if a VM/CT FAILS to start after backup, that IS important. - _BACKUP_NOISE = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', - 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'} - if event_type in _BACKUP_NOISE and not is_error: - if self._is_vzdump_active(): - return - - self._queue.put(NotificationEvent( - event_type, severity, data, source='tasks', - entity=entity, entity_id=vmid, - )) - - def _get_vm_name(self, vmid: str) -> str: - """Try to resolve VMID to name via config files.""" - if not vmid: - return '' - - # Try QEMU - conf_path = f'/etc/pve/qemu-server/{vmid}.conf' - name = self._read_name_from_conf(conf_path) - if name: - return name - - # Try LXC - conf_path = f'/etc/pve/lxc/{vmid}.conf' - name = self._read_name_from_conf(conf_path) - if name: - return name - - return '' - - @staticmethod - def _read_name_from_conf(path: str) -> str: - """Read 'name:' or 'hostname:' from PVE config file.""" - try: - if not os.path.exists(path): - return '' - with open(path, 'r') as f: - for line in f: - if line.startswith('name:'): - return line.split(':', 1)[1].strip() - if line.startswith('hostname:'): - return line.split(':', 1)[1].strip() - except (IOError, PermissionError): - pass - return '' - - -# ─── Polling Collector ──────────────────────────────────────────── - -class PollingCollector: - """Periodic collector that polls health state independently. - - Architecture: - - Completely independent from Health Monitor's suppression system. - Suppression Duration only affects the UI health badge; it NEVER blocks - notifications. - - Reads ``get_active_errors()`` (ALL active errors, even suppressed ones) - and decides when to notify based on its own 24-hour cycle. - - For *new* errors (first_seen within the last poll interval), notifies - immediately. - - For *persistent* errors (already known), re-notifies once every 24 h. - - Update checks run on their own 24-h timer and include security counts. - - Tracking is stored in ``notification_last_sent`` (same DB). - """ - - DIGEST_INTERVAL = 86400 # 24 h between re-notifications - UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans - NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new" - - _ENTITY_MAP = { - 'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''), - 'disk': ('storage', ''), 'network': ('network', ''), - 'pve_services': ('node', ''), 'security': ('user', ''), - 'updates': ('node', ''), 'storage': ('storage', ''), - } - - # Map health-persistence category names to our TEMPLATES event types. - # These must match keys in notification_templates.TEMPLATES exactly. - _CATEGORY_TO_EVENT_TYPE = { - 'cpu': 'cpu_high', - 'memory': 'ram_high', - 'load': 'load_high', - 'temperature': 'temp_high', - 'disk': 'disk_space_low', - 'storage': 'storage_unavailable', - 'network': 'network_down', - 'pve_services': 'service_fail', - 'security': 'auth_fail', - 'updates': 'update_available', - 'zfs': 'disk_io_error', - 'smart': 'disk_io_error', - 'disks': 'disk_io_error', - 'logs': 'system_problem', - 'vms': 'system_problem', - } - - def __init__(self, event_queue: Queue, poll_interval: int = 60): - self._queue = event_queue - self._running = False - self._thread: Optional[threading.Thread] = None - self._poll_interval = poll_interval - self._hostname = _hostname() - self._last_update_check = 0 - # In-memory cache: error_key -> last notification timestamp - self._last_notified: Dict[str, float] = {} - # Track known error keys so we can detect truly new ones - self._known_errors: set = set() - self._first_poll_done = False - - def start(self): - if self._running: - return - self._running = True - self._load_last_notified() - self._thread = threading.Thread(target=self._poll_loop, daemon=True, - name='polling-collector') - self._thread.start() - - def stop(self): - self._running = False - - # ── Main loop ────────────────────────────────────────────── - - def _poll_loop(self): - """Main polling loop.""" - # Initial delay to let health monitor warm up - for _ in range(15): - if not self._running: - return - time.sleep(1) - - while self._running: - try: - self._check_persistent_health() - self._check_updates() - except Exception as e: - print(f"[PollingCollector] Error: {e}") - - for _ in range(self._poll_interval): - if not self._running: - return - time.sleep(1) - - # ── Health errors (independent of suppression) ───────────── - - def _check_persistent_health(self): - """Read ALL active errors from health_persistence and decide - whether each one warrants a notification right now. - - Rules: - - A *new* error (not in _known_errors) -> notify immediately - - A *persistent* error already notified -> re-notify after 24 h - - Uses its own tracking, NOT the health monitor's needs_notification flag - """ - try: - from health_persistence import health_persistence - errors = health_persistence.get_active_errors() - except ImportError: - return - except Exception as e: - print(f"[PollingCollector] get_active_errors failed: {e}") - return - - now = time.time() - current_keys = set() - - for error in errors: - error_key = error.get('error_key', '') - if not error_key: - continue - - current_keys.add(error_key) - category = error.get('category', '') - severity = error.get('severity', 'WARNING') - reason = error.get('reason', '') - - # Determine if we should notify - is_new = error_key not in self._known_errors and self._first_poll_done - last_sent = self._last_notified.get(error_key, 0) - is_due = (now - last_sent) >= self.DIGEST_INTERVAL - - if not is_new and not is_due: - continue - - # Map to our event type - event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem') - entity, eid = self._ENTITY_MAP.get(category, ('node', '')) - - data = { - 'hostname': self._hostname, - 'category': category, - 'reason': reason, - 'error_key': error_key, - 'severity': severity, - 'first_seen': error.get('first_seen', ''), - 'last_seen': error.get('last_seen', ''), - 'is_persistent': not is_new, - } - - # Include extra details if present - details = error.get('details') - if isinstance(details, dict): - data.update(details) - elif isinstance(details, str): - try: - data.update(json.loads(details)) - except (json.JSONDecodeError, TypeError): - pass - - self._queue.put(NotificationEvent( - event_type, severity, data, source='health', - entity=entity, entity_id=eid or error_key, - )) - - # Track that we notified - self._last_notified[error_key] = now - self._persist_last_notified(error_key, now) - - # Remove tracking for errors that resolved - resolved = self._known_errors - current_keys - for key in resolved: - self._last_notified.pop(key, None) - - self._known_errors = current_keys - self._first_poll_done = True - - # ── Update check (enriched) ──────────────────────────────── - - def _check_updates(self): - """Check for available system updates every 24 h. - - Enriched output: total count, security updates, PVE version hint, - and top package names. - """ - now = time.time() - if now - self._last_update_check < self.UPDATE_CHECK_INTERVAL: - return - - self._last_update_check = now - - try: - result = subprocess.run( - ['apt-get', '-s', 'upgrade'], - capture_output=True, text=True, timeout=60, - ) - if result.returncode != 0: - return - - lines = [l for l in result.stdout.split('\n') if l.startswith('Inst ')] - total = len(lines) - if total == 0: - return - - packages = [l.split()[1] for l in lines] - security = [p for p in packages if any( - kw in p.lower() for kw in ('security', 'cve', 'openssl', 'libssl') - )] - - # Also detect security updates via apt changelog / Debian-Security origin - sec_result = subprocess.run( - ['apt-get', '-s', 'upgrade', '-o', 'Dir::Etc::SourceList=/dev/null', - '-o', 'Dir::Etc::SourceParts=/dev/null'], - capture_output=True, text=True, timeout=30, - ) - # Count lines from security repo (rough heuristic) - sec_count = max(len(security), 0) - try: - sec_output = subprocess.run( - ['apt-get', '-s', '--only-upgrade', 'install'] + packages[:50], - capture_output=True, text=True, timeout=30, - ) - for line in sec_output.stdout.split('\n'): - if 'security' in line.lower() and 'Inst ' in line: - sec_count += 1 - except Exception: - pass - - # Check for PVE version upgrade - pve_packages = [p for p in packages if 'pve-' in p.lower() or 'proxmox-' in p.lower()] - - # Build display details - top_pkgs = packages[:8] - details = ', '.join(top_pkgs) - if total > 8: - details += f', ... +{total - 8} more' - - data = { - 'hostname': self._hostname, - 'count': str(total), - 'security_count': str(sec_count), - 'details': details, - 'packages': ', '.join(packages[:20]), - } - if pve_packages: - data['pve_packages'] = ', '.join(pve_packages) - - self._queue.put(NotificationEvent( - 'update_available', 'INFO', data, - source='polling', entity='node', entity_id='', - )) - except Exception: - pass - - # ── Persistence helpers ──────────────────────────────────── - - def _load_last_notified(self): - """Load per-error notification timestamps from DB on startup.""" - try: - db_path = Path('/usr/local/share/proxmenux/health_monitor.db') - if not db_path.exists(): - return - conn = sqlite3.connect(str(db_path), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - cursor = conn.cursor() - cursor.execute( - "SELECT fingerprint, last_sent_ts FROM notification_last_sent " - "WHERE fingerprint LIKE 'health_%'" - ) - for fp, ts in cursor.fetchall(): - error_key = fp.replace('health_', '', 1) - self._last_notified[error_key] = ts - self._known_errors.add(error_key) - conn.close() - except Exception as e: - print(f"[PollingCollector] Failed to load last_notified: {e}") - - def _persist_last_notified(self, error_key: str, ts: float): - """Save per-error notification timestamp to DB.""" - try: - db_path = Path('/usr/local/share/proxmenux/health_monitor.db') - conn = sqlite3.connect(str(db_path), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - fp = f'health_{error_key}' - conn.execute(''' - INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count) - VALUES (?, ?, COALESCE( - (SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1 - )) - ''', (fp, int(ts), fp)) - conn.commit() - conn.close() - except Exception: - pass - - -# ─── Proxmox Webhook Receiver ─────────────────────────────────── - -class ProxmoxHookWatcher: - """Receives native Proxmox VE notifications via local webhook endpoint. - - Configured automatically via /etc/pve/notifications.cfg (endpoint + - matcher blocks). The setup-webhook API writes these blocks on first - enable. See flask_notification_routes.py for details. - - Payload varies by source (storage, replication, cluster, PBS, apt). - This class normalizes them into NotificationEvent objects. - """ - - def __init__(self, event_queue: Queue): - self._queue = event_queue - self._hostname = _hostname() - - def process_webhook(self, payload: dict) -> dict: - """Process an incoming Proxmox webhook payload. - - The PVE webhook is the PRIMARY source for vzdump, replication, - fencing, package-updates and system-mail events. PVE sends rich - detail (full logs, sizes, durations) that TaskWatcher cannot match. - - Body template delivers: - {title, message, severity, timestamp, fields: {type, hostname, job-id}} - - Returns: {'accepted': bool, 'event_type': str, 'event_id': str} - """ - if not payload: - return {'accepted': False, 'error': 'Empty payload'} - - # ── Extract structured PVE fields ── - fields = payload.get('fields') or {} - if isinstance(fields, str): - # Edge case: {{ json fields }} rendered as string instead of dict - try: - import json - fields = json.loads(fields) - except (json.JSONDecodeError, ValueError): - fields = {} - - pve_type = fields.get('type', '').lower().strip() - pve_hostname = fields.get('hostname', self._hostname) - pve_job_id = fields.get('job-id', '') - - title = payload.get('title', '') - message = payload.get('message', payload.get('body', '')) - severity_raw = payload.get('severity', 'info').lower().strip() - timestamp = payload.get('timestamp', '') - - # ── Classify by PVE type (direct, no heuristics needed) ── - import re - event_type, entity, entity_id = self._classify_pve( - pve_type, severity_raw, title, message - ) - - # Discard meta-events - if event_type == '_skip': - return {'accepted': False, 'skipped': True, 'reason': 'Meta-event filtered'} - - severity = self._map_severity(severity_raw) - - # ── Build rich data dict ── - # For webhook events, PVE's `message` IS the notification body. - # It contains full vzdump logs, package lists, error details, etc. - # We pass it as 'pve_message' so templates can use it directly. - data = { - 'hostname': pve_hostname, - 'pve_type': pve_type, - 'pve_message': message, - 'pve_title': title, - 'title': title, - 'job_id': pve_job_id, - } - - # Extract VMID and VM name from message for vzdump events - if pve_type == 'vzdump' and message: - # PVE vzdump messages contain lines like: - # "INFO: Starting Backup of VM 100 (qemu)" - # "VMID Name Status Time Size Filename" - # "100 arch-linux OK 00:05:30 1.2G /path/to/file" - vmids = re.findall(r'(?:VM|CT)\s+(\d+)', message, re.IGNORECASE) - if vmids: - data['vmid'] = vmids[0] - entity_id = vmids[0] - # Try to extract VM name from the table line - name_m = re.search(r'(\d+)\s+(\S+)\s+(?:OK|ERROR|WARNINGS)', message) - if name_m: - data['vmname'] = name_m.group(2) - # Extract size from "Total size: X" - size_m = re.search(r'Total size:\s*(.+?)(?:\n|$)', message) - if size_m: - data['size'] = size_m.group(1).strip() - # Extract duration from "Total running time: X" - dur_m = re.search(r'Total running time:\s*(.+?)(?:\n|$)', message) - if dur_m: - data['duration'] = dur_m.group(1).strip() - - event = NotificationEvent( - event_type=event_type, - severity=severity, - data=data, - source='proxmox_hook', - entity=entity, - entity_id=entity_id, - raw=payload, - ) - - self._queue.put(event) - return {'accepted': True, 'event_type': event_type, 'event_id': event.event_id} - - def _classify_pve(self, pve_type: str, severity: str, - title: str, message: str) -> tuple: - """Classify using PVE's structured fields.type. - - Returns (event_type, entity, entity_id). - """ - title_lower = (title or '').lower() - - # Skip overall/updates status change meta-events - if 'overall' in title_lower and ('changed' in title_lower or 'status' in title_lower): - return '_skip', '', '' - if 'updates' in title_lower and ('changed' in title_lower or 'status' in title_lower): - return '_skip', '', '' - - # ── Direct classification by PVE type ── - if pve_type == 'vzdump': - if severity in ('error', 'err'): - return 'backup_fail', 'vm', '' - return 'backup_complete', 'vm', '' - - if pve_type == 'fencing': - return 'split_brain', 'node', '' - - if pve_type == 'replication': - return 'replication_fail', 'vm', '' - - if pve_type == 'package-updates': - return 'update_available', 'node', '' - - if pve_type == 'system-mail': - return 'system_mail', 'node', '' - - # ── Fallback for unknown/empty pve_type ── - # (e.g. test notifications, future PVE event types) - msg_lower = (message or '').lower() - text = f"{title_lower} {msg_lower}" - - if 'vzdump' in text or 'backup' in text: - import re - m = re.search(r'(?:vm|ct)\s+(\d+)', text, re.IGNORECASE) - vmid = m.group(1) if m else '' - if any(w in text for w in ('fail', 'error')): - return 'backup_fail', 'vm', vmid - return 'backup_complete', 'vm', vmid - - if 'replication' in text: - return 'replication_fail', 'vm', '' - - # Generic fallback - return 'system_problem', 'node', '' - - # Old _classify removed -- replaced by _classify_pve above. - - @staticmethod - def _map_severity(raw: str) -> str: - raw_l = str(raw).lower() - if raw_l in ('critical', 'emergency', 'alert', 'crit', 'err', 'error'): - return 'CRITICAL' - if raw_l in ('warning', 'warn', 'notice'): - return 'WARNING' - return 'INFO' diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py deleted file mode 100644 index 3b2bed92..00000000 --- a/AppImage/scripts/notification_manager.py +++ /dev/null @@ -1,1283 +0,0 @@ -""" -ProxMenux Notification Manager -Central orchestrator for the notification service. - -Connects: -- notification_channels.py (transport: Telegram, Gotify, Discord) -- notification_templates.py (message formatting + optional AI) -- notification_events.py (event detection: Journal, Task, Polling watchers) -- health_persistence.py (DB: config storage, notification_history) - -Two interfaces consume this module: -1. Server mode: Flask imports and calls start()/stop()/send_notification() -2. CLI mode: `python3 notification_manager.py --action send --type vm_fail ...` - Scripts .sh in /usr/local/share/proxmenux/scripts call this directly. - -Author: MacRimi -""" - -import json -import os -import sys -import time -import socket -import sqlite3 -import threading -from queue import Queue, Empty -from datetime import datetime -from typing import Dict, Any, List, Optional -from pathlib import Path - -# Ensure local imports work -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -if BASE_DIR not in sys.path: - sys.path.insert(0, BASE_DIR) - -from notification_channels import create_channel, CHANNEL_TYPES -from notification_templates import ( - render_template, format_with_ai, TEMPLATES, - EVENT_GROUPS, get_event_types_by_group, get_default_enabled_events -) -from notification_events import ( - JournalWatcher, TaskWatcher, PollingCollector, NotificationEvent, - ProxmoxHookWatcher, -) - - -# ─── Constants ──────────────────────────────────────────────────── - -DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db') -SETTINGS_PREFIX = 'notification.' - -# Cooldown defaults (seconds) -DEFAULT_COOLDOWNS = { - 'CRITICAL': 60, # 60s minimum (prevents storm, delivers fast) - 'WARNING': 300, # 5 min - 'INFO': 900, # 15 min - 'resources': 900, # 15 min for resource alerts - 'updates': 86400, # 24h for update notifications -} - - -# ─── Storm Protection ──────────────────────────────────────────── - -GROUP_RATE_LIMITS = { - 'security': {'max_per_minute': 5, 'max_per_hour': 30}, - 'storage': {'max_per_minute': 3, 'max_per_hour': 20}, - 'cluster': {'max_per_minute': 5, 'max_per_hour': 20}, - 'network': {'max_per_minute': 3, 'max_per_hour': 15}, - 'resources': {'max_per_minute': 3, 'max_per_hour': 20}, - 'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60}, - 'backup': {'max_per_minute': 5, 'max_per_hour': 30}, - 'system': {'max_per_minute': 5, 'max_per_hour': 30}, -} - - -class GroupRateLimiter: - """Rate limiter per event group. Prevents notification storms.""" - - def __init__(self): - from collections import deque - self._deque = deque - self._minute_counts: Dict[str, Any] = {} # group -> deque[timestamp] - self._hour_counts: Dict[str, Any] = {} # group -> deque[timestamp] - - def allow(self, group: str) -> bool: - """Check if group rate limit allows this event.""" - limits = GROUP_RATE_LIMITS.get(group, GROUP_RATE_LIMITS['system']) - now = time.time() - - # Initialize if needed - if group not in self._minute_counts: - self._minute_counts[group] = self._deque() - self._hour_counts[group] = self._deque() - - # Prune old entries - minute_q = self._minute_counts[group] - hour_q = self._hour_counts[group] - while minute_q and now - minute_q[0] > 60: - minute_q.popleft() - while hour_q and now - hour_q[0] > 3600: - hour_q.popleft() - - # Check limits - if len(minute_q) >= limits['max_per_minute']: - return False - if len(hour_q) >= limits['max_per_hour']: - return False - - # Record - minute_q.append(now) - hour_q.append(now) - return True - - def get_stats(self) -> Dict[str, Dict[str, int]]: - """Return current rate stats per group.""" - now = time.time() - stats = {} - for group in self._minute_counts: - minute_q = self._minute_counts.get(group, []) - hour_q = self._hour_counts.get(group, []) - stats[group] = { - 'last_minute': sum(1 for t in minute_q if now - t <= 60), - 'last_hour': sum(1 for t in hour_q if now - t <= 3600), - } - return stats - - -AGGREGATION_RULES = { - 'auth_fail': {'window': 120, 'min_count': 3, 'burst_type': 'burst_auth_fail'}, - 'ip_block': {'window': 120, 'min_count': 3, 'burst_type': 'burst_ip_block'}, - 'disk_io_error': {'window': 60, 'min_count': 3, 'burst_type': 'burst_disk_io'}, - 'split_brain': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'}, - 'node_disconnect': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'}, -} - - -class BurstAggregator: - """Accumulates similar events in a time window, then sends a single summary. - - Examples: - - "Fail2Ban banned 17 IPs in 2 minutes" - - "Disk I/O errors: 34 events on /dev/sdb in 60s" - """ - - def __init__(self): - self._buckets: Dict[str, List] = {} # bucket_key -> [events] - self._deadlines: Dict[str, float] = {} # bucket_key -> flush_deadline - self._lock = threading.Lock() - - def ingest(self, event: NotificationEvent) -> Optional[NotificationEvent]: - """Add event to aggregation. Returns: - - None if event is being buffered (wait for window) - - Original event if not eligible for aggregation - """ - rule = AGGREGATION_RULES.get(event.event_type) - if not rule: - return event # Not aggregable, pass through - - bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}" - - with self._lock: - if bucket_key not in self._buckets: - self._buckets[bucket_key] = [] - self._deadlines[bucket_key] = time.time() + rule['window'] - - self._buckets[bucket_key].append(event) - - # First event in bucket: pass through immediately so user gets fast alert - if len(self._buckets[bucket_key]) == 1: - return event - - # Subsequent events: buffer (will be flushed as summary) - return None - - def flush_expired(self) -> List[NotificationEvent]: - """Flush all buckets past their deadline. Returns summary events.""" - now = time.time() - summaries = [] - - with self._lock: - expired_keys = [k for k, d in self._deadlines.items() if now >= d] - - for key in expired_keys: - events = self._buckets.pop(key, []) - del self._deadlines[key] - - if len(events) < 2: - continue # Single event already sent on ingest, no summary needed - - rule_type = key.split(':')[0] - rule = AGGREGATION_RULES.get(rule_type, {}) - min_count = rule.get('min_count', 2) - - if len(events) < min_count: - continue # Not enough events for a summary - - summary = self._create_summary(events, rule) - if summary: - summaries.append(summary) - - return summaries - - def _create_summary(self, events: List[NotificationEvent], - rule: dict) -> Optional[NotificationEvent]: - """Create a single summary event from multiple events.""" - if not events: - return None - - first = events[0] - # Determine highest severity - sev_order = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2} - max_severity = max(events, key=lambda e: sev_order.get(e.severity, 0)).severity - - # Collect unique entity_ids - entity_ids = list(set(e.entity_id for e in events if e.entity_id)) - entity_list = ', '.join(entity_ids[:10]) if entity_ids else 'multiple sources' - if len(entity_ids) > 10: - entity_list += f' (+{len(entity_ids) - 10} more)' - - # Calculate window - window_secs = events[-1].ts_epoch - events[0].ts_epoch - if window_secs < 120: - window_str = f'{int(window_secs)}s' - else: - window_str = f'{int(window_secs / 60)}m' - - burst_type = rule.get('burst_type', 'burst_generic') - - data = { - 'hostname': first.data.get('hostname', socket.gethostname()), - 'count': str(len(events)), - 'window': window_str, - 'entity_list': entity_list, - 'event_type': first.event_type, - } - - return NotificationEvent( - event_type=burst_type, - severity=max_severity, - data=data, - source='aggregator', - entity=first.entity, - entity_id='burst', - ) - - -# ─── Notification Manager ───────────────────────────────────────── - -class NotificationManager: - """Central notification orchestrator. - - Manages channels, event watchers, deduplication, and dispatch. - Can run in server mode (background threads) or CLI mode (one-shot). - """ - - def __init__(self): - self._channels: Dict[str, Any] = {} # channel_name -> channel_instance - self._event_queue: Queue = Queue() - self._running = False - self._config: Dict[str, str] = {} - self._enabled = False - self._lock = threading.Lock() - - # Watchers - self._journal_watcher: Optional[JournalWatcher] = None - self._task_watcher: Optional[TaskWatcher] = None - self._polling_collector: Optional[PollingCollector] = None - self._dispatch_thread: Optional[threading.Thread] = None - - # Webhook receiver (no thread, passive) - self._hook_watcher: Optional[ProxmoxHookWatcher] = None - - # Cooldown tracking: {fingerprint: last_sent_timestamp} - self._cooldowns: Dict[str, float] = {} - - # Storm protection - self._group_limiter = GroupRateLimiter() - self._aggregator = BurstAggregator() - self._aggregation_thread: Optional[threading.Thread] = None - - # Stats - self._stats = { - 'started_at': None, - 'total_sent': 0, - 'total_errors': 0, - 'last_sent_at': None, - } - - # ─── Configuration ────────────────────────────────────────── - - def _load_config(self): - """Load notification settings from the shared SQLite database.""" - self._config = {} - try: - if not DB_PATH.exists(): - return - - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - cursor = conn.cursor() - cursor.execute( - 'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?', - (f'{SETTINGS_PREFIX}%',) - ) - for key, value in cursor.fetchall(): - # Strip prefix for internal use - short_key = key[len(SETTINGS_PREFIX):] - self._config[short_key] = value - conn.close() - except Exception as e: - print(f"[NotificationManager] Failed to load config: {e}") - - # Reconcile per-event toggles with current template defaults. - # If a template's default_enabled was changed (e.g. state_change False), - # but the DB has a stale 'true' from a previous default, fix it now. - # Only override if the user hasn't explicitly set it (we track this with - # a sentinel: if the value came from auto-save of defaults, it may be stale). - for event_type, tmpl in TEMPLATES.items(): - key = f'event.{event_type}' - if key in self._config: - db_val = self._config[key] == 'true' - tmpl_default = tmpl.get('default_enabled', True) - # If template says disabled but DB says enabled, AND there's no - # explicit user marker, enforce the template default. - if not tmpl_default and db_val: - # Check if user explicitly enabled it (look for a marker) - marker = f'event_explicit.{event_type}' - if marker not in self._config: - self._config[key] = 'false' - - self._enabled = self._config.get('enabled', 'false') == 'true' - self._rebuild_channels() - - def _save_setting(self, key: str, value: str): - """Save a single notification setting to the database.""" - full_key = f'{SETTINGS_PREFIX}{key}' - now = datetime.now().isoformat() - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - cursor = conn.cursor() - cursor.execute(''' - INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) - VALUES (?, ?, ?) - ''', (full_key, value, now)) - conn.commit() - conn.close() - self._config[key] = value - except Exception as e: - print(f"[NotificationManager] Failed to save setting {key}: {e}") - - def _rebuild_channels(self): - """Rebuild channel instances from current config.""" - self._channels = {} - - for ch_type in CHANNEL_TYPES: - enabled_key = f'{ch_type}.enabled' - if self._config.get(enabled_key) != 'true': - continue - - # Gather config keys for this channel - ch_config = {} - for config_key in CHANNEL_TYPES[ch_type]['config_keys']: - full_key = f'{ch_type}.{config_key}' - ch_config[config_key] = self._config.get(full_key, '') - - channel = create_channel(ch_type, ch_config) - if channel: - valid, err = channel.validate_config() - if valid: - self._channels[ch_type] = channel - else: - print(f"[NotificationManager] Channel {ch_type} invalid: {err}") - - def reload_config(self): - """Reload config from DB without restarting.""" - with self._lock: - self._load_config() - return {'success': True, 'channels': list(self._channels.keys())} - - # ─── Server Mode (Background) ────────────────────────────── - - def start(self): - """Start the notification service in server mode. - - Launches watchers and dispatch loop as daemon threads. - Called by flask_server.py on startup. - """ - if self._running: - return - - self._load_config() - self._load_cooldowns_from_db() - - if not self._enabled: - print("[NotificationManager] Service is disabled. Skipping start.") - return - - self._running = True - self._stats['started_at'] = datetime.now().isoformat() - - # Ensure PVE webhook is configured (repairs priv config if missing) - try: - from flask_notification_routes import setup_pve_webhook_core - wh_result = setup_pve_webhook_core() - if wh_result.get('configured'): - print("[NotificationManager] PVE webhook configured OK.") - elif wh_result.get('error'): - print(f"[NotificationManager] PVE webhook warning: {wh_result['error']}") - except ImportError: - pass # flask_notification_routes not loaded yet (early startup) - except Exception as e: - print(f"[NotificationManager] PVE webhook setup error: {e}") - - # Start event watchers - self._journal_watcher = JournalWatcher(self._event_queue) - self._task_watcher = TaskWatcher(self._event_queue) - self._polling_collector = PollingCollector(self._event_queue) - - self._journal_watcher.start() - self._task_watcher.start() - self._polling_collector.start() - - # Start dispatch loop - self._dispatch_thread = threading.Thread( - target=self._dispatch_loop, daemon=True, name='notification-dispatch' - ) - self._dispatch_thread.start() - - print(f"[NotificationManager] Started with channels: {list(self._channels.keys())}") - - def stop(self): - """Stop the notification service cleanly.""" - self._running = False - - if self._journal_watcher: - self._journal_watcher.stop() - if self._task_watcher: - self._task_watcher.stop() - if self._polling_collector: - self._polling_collector.stop() - - print("[NotificationManager] Stopped.") - - def _dispatch_loop(self): - """Main dispatch loop: reads queue -> filters -> formats -> sends -> records.""" - last_cleanup = time.monotonic() - last_flush = time.monotonic() - cleanup_interval = 3600 # Cleanup cooldowns every hour - flush_interval = 5 # Flush aggregation buckets every 5s - - while self._running: - try: - event = self._event_queue.get(timeout=2) - except Empty: - # Periodic maintenance during idle - now_mono = time.monotonic() - if now_mono - last_cleanup > cleanup_interval: - self._cleanup_old_cooldowns() - last_cleanup = now_mono - # Flush expired aggregation buckets - if now_mono - last_flush > flush_interval: - self._flush_aggregation() - last_flush = now_mono - continue - - try: - self._process_event(event) - except Exception as e: - print(f"[NotificationManager] Dispatch error: {e}") - - # Also flush aggregation after each event - if time.monotonic() - last_flush > flush_interval: - self._flush_aggregation() - last_flush = time.monotonic() - - def _flush_aggregation(self): - """Flush expired aggregation buckets and dispatch summaries.""" - try: - summaries = self._aggregator.flush_expired() - for summary_event in summaries: - # Burst summaries bypass aggregator but still pass cooldown + rate limit - self._process_event_direct(summary_event) - except Exception as e: - print(f"[NotificationManager] Aggregation flush error: {e}") - - def _process_event(self, event: NotificationEvent): - """Process a single event: filter -> aggregate -> cooldown -> rate limit -> dispatch.""" - if not self._enabled: - return - - # Check if this event's GROUP is enabled in settings. - # The UI saves categories by group key: events.vm_ct, events.backup, etc. - template = TEMPLATES.get(event.event_type, {}) - event_group = template.get('group', 'system') - group_setting = f'events.{event_group}' - if self._config.get(group_setting, 'true') == 'false': - return - - # Check if this SPECIFIC event type is enabled (granular per-event toggle). - # Key format: event.{event_type} = "true"/"false" - # Default comes from the template's default_enabled field. - default_enabled = 'true' if template.get('default_enabled', True) else 'false' - event_specific = f'event.{event.event_type}' - if self._config.get(event_specific, default_enabled) == 'false': - return - - # Check severity filter. - # The UI saves severity_filter as: "all", "warning", "critical". - # Map to our internal severity names for comparison. - severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} - raw_filter = self._config.get('severity_filter', 'all') - min_severity = severity_map.get(raw_filter.lower(), 'INFO') - if not self._meets_severity(event.severity, min_severity): - return - - # Try aggregation (may buffer the event) - result = self._aggregator.ingest(event) - if result is None: - return # Buffered, will be flushed as summary later - event = result # Use original event (first in burst passes through) - - # From here, proceed with dispatch (shared with _process_event_direct) - self._dispatch_event(event) - - def _process_event_direct(self, event: NotificationEvent): - """Process a burst summary event. Bypasses aggregator but applies ALL other filters.""" - if not self._enabled: - return - - # Check group filter (same as _process_event) - template = TEMPLATES.get(event.event_type, {}) - event_group = template.get('group', 'system') - group_setting = f'events.{event_group}' - if self._config.get(group_setting, 'true') == 'false': - return - - # Check per-event filter (same as _process_event) - default_enabled = 'true' if template.get('default_enabled', True) else 'false' - event_specific = f'event.{event.event_type}' - if self._config.get(event_specific, default_enabled) == 'false': - return - - # Check severity filter (same mapping as _process_event) - severity_map = {'all': 'INFO', 'warning': 'WARNING', 'critical': 'CRITICAL'} - raw_filter = self._config.get('severity_filter', 'all') - min_severity = severity_map.get(raw_filter.lower(), 'INFO') - if not self._meets_severity(event.severity, min_severity): - return - - self._dispatch_event(event) - - def _dispatch_event(self, event: NotificationEvent): - """Shared dispatch pipeline: cooldown -> rate limit -> render -> send.""" - # Check cooldown - if not self._check_cooldown(event): - return - - # Check group rate limit - template = TEMPLATES.get(event.event_type, {}) - group = template.get('group', 'system') - if not self._group_limiter.allow(group): - return - - # Use the properly mapped severity from the event, not from template defaults. - # event.severity was set by _map_severity which normalises to CRITICAL/WARNING/INFO. - severity = event.severity - - # Inject the canonical severity into data so templates see it too. - event.data['severity'] = severity - - # Render message from template (structured output) - rendered = render_template(event.event_type, event.data) - - # Optional AI enhancement (on text body only) - ai_config = { - 'enabled': self._config.get('ai_enabled', 'false'), - 'provider': self._config.get('ai_provider', ''), - 'api_key': self._config.get('ai_api_key', ''), - 'model': self._config.get('ai_model', ''), - } - body = format_with_ai( - rendered['title'], rendered['body'], severity, ai_config - ) - - # Enrich data with structured fields for channels that support them - enriched_data = dict(event.data) - enriched_data['_rendered_fields'] = rendered.get('fields', []) - enriched_data['_body_html'] = rendered.get('body_html', '') - - # Send through all active channels - self._dispatch_to_channels( - rendered['title'], body, severity, - event.event_type, enriched_data, event.source - ) - - def _dispatch_to_channels(self, title: str, body: str, severity: str, - event_type: str, data: Dict, source: str): - """Send notification through all configured channels.""" - with self._lock: - channels = dict(self._channels) - - for ch_name, channel in channels.items(): - try: - result = channel.send(title, body, severity, data) - self._record_history( - event_type, ch_name, title, body, severity, - result.get('success', False), - result.get('error', ''), - source - ) - - if result.get('success'): - self._stats['total_sent'] += 1 - self._stats['last_sent_at'] = datetime.now().isoformat() - else: - self._stats['total_errors'] += 1 - print(f"[NotificationManager] Send failed ({ch_name}): {result.get('error')}") - - except Exception as e: - self._stats['total_errors'] += 1 - self._record_history( - event_type, ch_name, title, body, severity, - False, str(e), source - ) - - # ─── Cooldown / Dedup ─────────────────────────────────────── - - def _check_cooldown(self, event: NotificationEvent) -> bool: - """Check if the event passes cooldown rules.""" - now = time.time() - - # Determine cooldown period - template = TEMPLATES.get(event.event_type, {}) - group = template.get('group', 'system') - - # Priority: per-type config > per-severity > default - cooldown_key = f'cooldown.{event.event_type}' - cooldown_str = self._config.get(cooldown_key) - - if cooldown_str is None: - cooldown_key_group = f'cooldown.{group}' - cooldown_str = self._config.get(cooldown_key_group) - - if cooldown_str is not None: - cooldown = int(cooldown_str) - else: - cooldown = DEFAULT_COOLDOWNS.get(event.severity, 300) - - # CRITICAL events: 60s minimum cooldown (prevents storm, but delivers fast) - if event.severity == 'CRITICAL' and cooldown_str is None: - cooldown = 60 - - # Backup/replication events: each execution is unique and should - # always be delivered. A 10s cooldown prevents exact duplicates - # (webhook + tasks) but allows repeated backup jobs to report. - _ALWAYS_DELIVER = {'backup_complete', 'backup_fail', 'backup_start', - 'replication_complete', 'replication_fail'} - if event.event_type in _ALWAYS_DELIVER and cooldown_str is None: - cooldown = 10 - - # VM/CT state changes are real user actions that should always be - # delivered. Each start/stop/shutdown is a distinct event. A 5s - # cooldown prevents exact duplicates from concurrent watchers. - _STATE_EVENTS = { - 'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', - 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart', - 'vm_fail', 'ct_fail', - } - if event.event_type in _STATE_EVENTS and cooldown_str is None: - cooldown = 5 - - # System shutdown/reboot must be delivered immediately -- the node - # is going down and there may be only seconds to send the message. - _URGENT_EVENTS = {'system_shutdown', 'system_reboot'} - if event.event_type in _URGENT_EVENTS and cooldown_str is None: - cooldown = 5 - - # Check against last sent time using stable fingerprint - last_sent = self._cooldowns.get(event.fingerprint, 0) - - if now - last_sent < cooldown: - return False - - self._cooldowns[event.fingerprint] = now - self._persist_cooldown(event.fingerprint, now) - return True - - def _load_cooldowns_from_db(self): - """Load persistent cooldown state from SQLite (up to 48h).""" - try: - if not DB_PATH.exists(): - return - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - cursor = conn.cursor() - cursor.execute('SELECT fingerprint, last_sent_ts FROM notification_last_sent') - now = time.time() - for fp, ts in cursor.fetchall(): - if now - ts < 172800: # 48h window - self._cooldowns[fp] = ts - conn.close() - except Exception as e: - print(f"[NotificationManager] Failed to load cooldowns: {e}") - - def _persist_cooldown(self, fingerprint: str, ts: float): - """Save cooldown timestamp to SQLite for restart persistence.""" - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - conn.execute(''' - INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count) - VALUES (?, ?, COALESCE( - (SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1 - )) - ''', (fingerprint, int(ts), fingerprint)) - conn.commit() - conn.close() - except Exception: - pass # Non-critical, in-memory cooldown still works - - def _cleanup_old_cooldowns(self): - """Remove cooldown entries older than 48h from both memory and DB.""" - cutoff = time.time() - 172800 # 48h - self._cooldowns = {k: v for k, v in self._cooldowns.items() if v > cutoff} - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('DELETE FROM notification_last_sent WHERE last_sent_ts < ?', (int(cutoff),)) - conn.commit() - conn.close() - except Exception: - pass - - @staticmethod - def _meets_severity(event_severity: str, min_severity: str) -> bool: - """Check if event severity meets the minimum threshold.""" - levels = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2} - return levels.get(event_severity, 0) >= levels.get(min_severity, 0) - - # ─── History Recording ────────────────────────────────────── - - def _record_history(self, event_type: str, channel: str, title: str, - message: str, severity: str, success: bool, - error_message: str, source: str): - """Record a notification attempt in the history table.""" - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - cursor = conn.cursor() - cursor.execute(''' - INSERT INTO notification_history - (event_type, channel, title, message, severity, sent_at, success, error_message, source) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) - ''', ( - event_type, channel, title, message[:500], severity, - datetime.now().isoformat(), 1 if success else 0, - error_message[:500] if error_message else None, source - )) - conn.commit() - conn.close() - except Exception as e: - print(f"[NotificationManager] History record error: {e}") - - # ─── Public API (used by Flask routes and CLI) ────────────── - - def send_notification(self, event_type: str, severity: str, - title: str, message: str, - data: Optional[Dict] = None, - source: str = 'api') -> Dict[str, Any]: - """Send a notification directly (bypasses queue and cooldown). - - Used by CLI and API for explicit sends. - """ - if not self._channels: - self._load_config() - - if not self._channels: - return { - 'success': False, - 'error': 'No channels configured or enabled', - 'channels_sent': [], - } - - # Render template if available - if event_type in TEMPLATES and not message: - rendered = render_template(event_type, data or {}) - title = title or rendered['title'] - message = rendered['body'] - severity = severity or rendered['severity'] - - # AI enhancement - ai_config = { - 'enabled': self._config.get('ai_enabled', 'false'), - 'provider': self._config.get('ai_provider', ''), - 'api_key': self._config.get('ai_api_key', ''), - 'model': self._config.get('ai_model', ''), - } - message = format_with_ai(title, message, severity, ai_config) - - results = {} - channels_sent = [] - errors = [] - - with self._lock: - channels = dict(self._channels) - - for ch_name, channel in channels.items(): - try: - result = channel.send(title, message, severity, data) - results[ch_name] = result - - self._record_history( - event_type, ch_name, title, message, severity, - result.get('success', False), - result.get('error', ''), - source - ) - - if result.get('success'): - channels_sent.append(ch_name) - else: - errors.append(f"{ch_name}: {result.get('error')}") - except Exception as e: - errors.append(f"{ch_name}: {str(e)}") - - return { - 'success': len(channels_sent) > 0, - 'channels_sent': channels_sent, - 'errors': errors, - 'total_channels': len(channels), - } - - def send_raw(self, title: str, message: str, - severity: str = 'INFO', - source: str = 'api') -> Dict[str, Any]: - """Send a raw message without template (for custom scripts).""" - return self.send_notification( - 'custom', severity, title, message, source=source - ) - - def test_channel(self, channel_name: str = 'all') -> Dict[str, Any]: - """Test one or all configured channels.""" - if not self._channels: - self._load_config() - - if not self._channels: - return {'success': False, 'error': 'No channels configured'} - - results = {} - - if channel_name == 'all': - targets = dict(self._channels) - elif channel_name in self._channels: - targets = {channel_name: self._channels[channel_name]} - else: - # Try to create channel from config even if not enabled - ch_config = {} - for config_key in CHANNEL_TYPES.get(channel_name, {}).get('config_keys', []): - ch_config[config_key] = self._config.get(f'{channel_name}.{config_key}', '') - - channel = create_channel(channel_name, ch_config) - if channel: - targets = {channel_name: channel} - else: - return {'success': False, 'error': f'Channel {channel_name} not configured'} - - for ch_name, channel in targets.items(): - success, error = channel.test() - results[ch_name] = {'success': success, 'error': error} - - self._record_history( - 'test', ch_name, 'ProxMenux Test', - 'Test notification', 'INFO', - success, error, 'api' - ) - - overall_success = any(r['success'] for r in results.values()) - return { - 'success': overall_success, - 'results': results, - } - - # ─── Proxmox Webhook ────────────────────────────────────────── - - def process_webhook(self, payload: dict) -> dict: - """Process incoming Proxmox webhook. Delegates to ProxmoxHookWatcher.""" - if not self._hook_watcher: - self._hook_watcher = ProxmoxHookWatcher(self._event_queue) - return self._hook_watcher.process_webhook(payload) - - def get_webhook_secret(self) -> str: - """Get configured webhook secret, or empty string if none.""" - if not self._config: - self._load_config() - return self._config.get('webhook_secret', '') - - def get_webhook_allowed_ips(self) -> list: - """Get list of allowed IPs for webhook, or empty list (allow all).""" - if not self._config: - self._load_config() - raw = self._config.get('webhook_allowed_ips', '') - if not raw: - return [] - return [ip.strip() for ip in str(raw).split(',') if ip.strip()] - - # ─── Status & Settings ────────────────────────────────────── - - def get_status(self) -> Dict[str, Any]: - """Get current service status.""" - if not self._config: - self._load_config() - - return { - 'enabled': self._enabled, - 'running': self._running, - 'channels': { - name: { - 'type': name, - 'connected': True, - } - for name in self._channels - }, - 'stats': self._stats, - 'watchers': { - 'journal': self._journal_watcher is not None and self._running, - 'task': self._task_watcher is not None and self._running, - 'polling': self._polling_collector is not None and self._running, - }, - } - - def set_enabled(self, enabled: bool) -> Dict[str, Any]: - """Enable or disable the notification service.""" - self._save_setting('enabled', 'true' if enabled else 'false') - self._enabled = enabled - - if enabled and not self._running: - self.start() - elif not enabled and self._running: - self.stop() - - return {'success': True, 'enabled': enabled} - - def list_channels(self) -> Dict[str, Any]: - """List all channel types with their configuration status.""" - if not self._config: - self._load_config() - - channels_info = {} - for ch_type, info in CHANNEL_TYPES.items(): - enabled = self._config.get(f'{ch_type}.enabled', 'false') == 'true' - configured = all( - bool(self._config.get(f'{ch_type}.{k}', '')) - for k in info['config_keys'] - ) - channels_info[ch_type] = { - 'name': info['name'], - 'enabled': enabled, - 'configured': configured, - 'active': ch_type in self._channels, - } - - return {'channels': channels_info} - - def get_history(self, limit: int = 50, offset: int = 0, - severity: str = '', channel: str = '') -> Dict[str, Any]: - """Get notification history with optional filters.""" - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - - query = 'SELECT * FROM notification_history WHERE 1=1' - params: list = [] - - if severity: - query += ' AND severity = ?' - params.append(severity) - if channel: - query += ' AND channel = ?' - params.append(channel) - - query += ' ORDER BY sent_at DESC LIMIT ? OFFSET ?' - params.extend([limit, offset]) - - cursor.execute(query, params) - rows = [dict(row) for row in cursor.fetchall()] - - # Get total count - count_query = 'SELECT COUNT(*) FROM notification_history WHERE 1=1' - count_params: list = [] - if severity: - count_query += ' AND severity = ?' - count_params.append(severity) - if channel: - count_query += ' AND channel = ?' - count_params.append(channel) - - cursor.execute(count_query, count_params) - total = cursor.fetchone()[0] - - conn.close() - - return { - 'history': rows, - 'total': total, - 'limit': limit, - 'offset': offset, - } - except Exception as e: - return {'history': [], 'total': 0, 'error': str(e)} - - def clear_history(self) -> Dict[str, Any]: - """Clear all notification history.""" - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - conn.execute('DELETE FROM notification_history') - conn.commit() - conn.close() - return {'success': True} - except Exception as e: - return {'success': False, 'error': str(e)} - - def get_settings(self) -> Dict[str, Any]: - """Get all notification settings for the UI. - - Returns a structure matching the frontend's NotificationConfig shape - so the round-trip (GET -> edit -> POST) is seamless. - """ - if not self._config: - self._load_config() - - # Build nested channels object matching frontend ChannelConfig - channels = {} - for ch_type, info in CHANNEL_TYPES.items(): - ch_cfg: Dict[str, Any] = { - 'enabled': self._config.get(f'{ch_type}.enabled', 'false') == 'true', - } - for config_key in info['config_keys']: - ch_cfg[config_key] = self._config.get(f'{ch_type}.{config_key}', '') - channels[ch_type] = ch_cfg - - # Build event_categories dict (group-level toggle) - # EVENT_GROUPS is a dict: { 'system': {...}, 'vm_ct': {...}, ... } - event_categories = {} - for group_key in EVENT_GROUPS: - event_categories[group_key] = self._config.get(f'events.{group_key}', 'true') == 'true' - - # Build per-event toggles: { 'vm_start': true, 'vm_stop': false, ... } - event_toggles = {} - for event_type, tmpl in TEMPLATES.items(): - default = tmpl.get('default_enabled', True) - saved = self._config.get(f'event.{event_type}', None) - if saved is not None: - event_toggles[event_type] = saved == 'true' - else: - event_toggles[event_type] = default - - # Build event_types_by_group for UI rendering - event_types_by_group = get_event_types_by_group() - - config = { - 'enabled': self._enabled, - 'channels': channels, - 'severity_filter': self._config.get('severity_filter', 'all'), - 'event_categories': event_categories, - 'event_toggles': event_toggles, - 'event_types_by_group': event_types_by_group, - 'ai_enabled': self._config.get('ai_enabled', 'false') == 'true', - 'ai_provider': self._config.get('ai_provider', 'openai'), - 'ai_api_key': self._config.get('ai_api_key', ''), - 'ai_model': self._config.get('ai_model', ''), - 'hostname': self._config.get('hostname', ''), - 'webhook_secret': self._config.get('webhook_secret', ''), - 'webhook_allowed_ips': self._config.get('webhook_allowed_ips', ''), - 'pbs_host': self._config.get('pbs_host', ''), - 'pve_host': self._config.get('pve_host', ''), - 'pbs_trusted_sources': self._config.get('pbs_trusted_sources', ''), - } - - return { - 'success': True, - 'config': config, - } - - def save_settings(self, settings: Dict[str, str]) -> Dict[str, Any]: - """Save multiple notification settings at once.""" - try: - conn = sqlite3.connect(str(DB_PATH), timeout=10) - conn.execute('PRAGMA journal_mode=WAL') - conn.execute('PRAGMA busy_timeout=5000') - cursor = conn.cursor() - now = datetime.now().isoformat() - - for key, value in settings.items(): - # Accept both prefixed and unprefixed keys - full_key = key if key.startswith(SETTINGS_PREFIX) else f'{SETTINGS_PREFIX}{key}' - short_key = full_key[len(SETTINGS_PREFIX):] - - cursor.execute(''' - INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) - VALUES (?, ?, ?) - ''', (full_key, str(value), now)) - - self._config[short_key] = str(value) - - # If user is explicitly enabling an event that defaults to disabled, - # mark it so _load_config reconciliation won't override it later. - if short_key.startswith('event.') and str(value) == 'true': - event_type = short_key[6:] # strip 'event.' - tmpl = TEMPLATES.get(event_type, {}) - if not tmpl.get('default_enabled', True): - marker_key = f'{SETTINGS_PREFIX}event_explicit.{event_type}' - cursor.execute(''' - INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) - VALUES (?, ?, ?) - ''', (marker_key, 'true', now)) - self._config[f'event_explicit.{event_type}'] = 'true' - - conn.commit() - conn.close() - - # Rebuild channels with new config - was_enabled = self._enabled - self._enabled = self._config.get('enabled', 'false') == 'true' - self._rebuild_channels() - - # Start/stop service and auto-configure PVE webhook - pve_webhook_result = None - if self._enabled and not was_enabled: - # Notifications just got ENABLED -> start service + setup PVE webhook - if not self._running: - self.start() - try: - from flask_notification_routes import setup_pve_webhook_core - pve_webhook_result = setup_pve_webhook_core() - except ImportError: - pass # flask_notification_routes not available (CLI mode) - except Exception as e: - pve_webhook_result = {'configured': False, 'error': str(e)} - elif not self._enabled and was_enabled: - # Notifications just got DISABLED -> stop service + cleanup PVE webhook - if self._running: - self.stop() - try: - from flask_notification_routes import cleanup_pve_webhook_core - cleanup_pve_webhook_core() - except ImportError: - pass - except Exception: - pass - - result = {'success': True, 'channels_active': list(self._channels.keys())} - if pve_webhook_result: - result['pve_webhook'] = pve_webhook_result - return result - except Exception as e: - return {'success': False, 'error': str(e)} - - -# ─── Singleton (for server mode) ───────────────────────────────── - -notification_manager = NotificationManager() - - -# ─── CLI Interface ──────────────────────────────────────────────── - -def _print_result(result: Dict, as_json: bool): - """Print CLI result in human-readable or JSON format.""" - if as_json: - print(json.dumps(result, indent=2, default=str)) - return - - if result.get('success'): - print(f"OK: ", end='') - elif 'success' in result and not result['success']: - print(f"ERROR: ", end='') - - # Format based on content - if 'channels_sent' in result: - sent = result.get('channels_sent', []) - print(f"Sent via: {', '.join(sent) if sent else 'none'}") - if result.get('errors'): - for err in result['errors']: - print(f" Error: {err}") - elif 'results' in result: - for ch, r in result['results'].items(): - status = 'OK' if r['success'] else f"FAILED: {r['error']}" - print(f" {ch}: {status}") - elif 'channels' in result: - for ch, info in result['channels'].items(): - status = 'active' if info.get('active') else ('configured' if info.get('configured') else 'not configured') - enabled = 'enabled' if info.get('enabled') else 'disabled' - print(f" {info['name']}: {enabled}, {status}") - elif 'enabled' in result and 'running' in result: - print(f"Enabled: {result['enabled']}, Running: {result['running']}") - if result.get('stats'): - stats = result['stats'] - print(f" Total sent: {stats.get('total_sent', 0)}") - print(f" Total errors: {stats.get('total_errors', 0)}") - if stats.get('last_sent_at'): - print(f" Last sent: {stats['last_sent_at']}") - elif 'enabled' in result: - print(f"Service {'enabled' if result['enabled'] else 'disabled'}") - else: - print(json.dumps(result, indent=2, default=str)) - - -if __name__ == '__main__': - import argparse - - parser = argparse.ArgumentParser( - description='ProxMenux Notification Manager CLI', - epilog='Example: python3 notification_manager.py --action send --type vm_fail --severity CRITICAL --title "VM 100 failed" --message "QEMU process crashed"' - ) - parser.add_argument('--action', required=True, - choices=['send', 'send-raw', 'test', 'status', - 'enable', 'disable', 'list-channels'], - help='Action to perform') - parser.add_argument('--type', help='Event type for send action (e.g. vm_fail, backup_complete)') - parser.add_argument('--severity', default='INFO', - choices=['INFO', 'WARNING', 'CRITICAL'], - help='Notification severity (default: INFO)') - parser.add_argument('--title', help='Notification title') - parser.add_argument('--message', help='Notification message body') - parser.add_argument('--channel', default='all', - help='Specific channel for test (default: all)') - parser.add_argument('--json', action='store_true', - help='Output result as JSON') - - args = parser.parse_args() - - mgr = NotificationManager() - mgr._load_config() - - if args.action == 'send': - if not args.type: - parser.error('--type is required for send action') - result = mgr.send_notification( - args.type, args.severity, - args.title or '', args.message or '', - data={ - 'hostname': socket.gethostname().split('.')[0], - 'reason': args.message or '', - }, - source='cli' - ) - - elif args.action == 'send-raw': - if not args.title or not args.message: - parser.error('--title and --message are required for send-raw') - result = mgr.send_raw(args.title, args.message, args.severity, source='cli') - - elif args.action == 'test': - result = mgr.test_channel(args.channel) - - elif args.action == 'status': - result = mgr.get_status() - - elif args.action == 'enable': - result = mgr.set_enabled(True) - - elif args.action == 'disable': - result = mgr.set_enabled(False) - - elif args.action == 'list-channels': - result = mgr.list_channels() - - else: - result = {'error': f'Unknown action: {args.action}'} - - _print_result(result, args.json) - - # Exit with appropriate code - sys.exit(0 if result.get('success', True) else 1) diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py deleted file mode 100644 index 55371f45..00000000 --- a/AppImage/scripts/notification_templates.py +++ /dev/null @@ -1,958 +0,0 @@ -""" -ProxMenux Notification Templates -Message templates for all event types with per-channel formatting. - -Templates use Python str.format() variables: - {hostname}, {severity}, {category}, {reason}, {summary}, - {previous}, {current}, {vmid}, {vmname}, {timestamp}, etc. - -Optional AI enhancement enriches messages with context/suggestions. - -Author: MacRimi -""" - -import json -import re -import socket -import time -import urllib.request -import urllib.error -from typing import Dict, Any, Optional, List - - -# ─── vzdump message parser ─────────────────────────────────────── - -def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: - """Parse a PVE vzdump notification message into structured data. - - Supports two formats: - 1. Local storage: table with columns VMID Name Status Time Size Filename - 2. PBS storage: log-style output with 'Finished Backup of VM NNN (HH:MM:SS)' - and sizes in lines like 'root.pxar: had to backup X of Y' or 'transferred X' - - Returns dict with 'vms' list, 'total_time', 'total_size', or None. - """ - if not message: - return None - - vms: List[Dict[str, str]] = [] - total_time = '' - total_size = '' - - lines = message.split('\n') - - # ── Strategy 1: classic table (local/NFS/CIFS storage) ── - header_idx = -1 - for i, line in enumerate(lines): - if re.match(r'\s*VMID\s+Name\s+Status', line, re.IGNORECASE): - header_idx = i - break - - if header_idx >= 0: - # Use column positions from the header to slice each row. - # Header: "VMID Name Status Time Size Filename" - header = lines[header_idx] - col_starts = [] - for col_name in ['VMID', 'Name', 'Status', 'Time', 'Size', 'Filename']: - idx = header.find(col_name) - if idx >= 0: - col_starts.append(idx) - - if len(col_starts) == 6: - for line in lines[header_idx + 1:]: - stripped = line.strip() - if not stripped or stripped.startswith('Total') or stripped.startswith('Logs') or stripped.startswith('='): - break - # Pad line to avoid index errors - padded = line.ljust(col_starts[-1] + 50) - vmid = padded[col_starts[0]:col_starts[1]].strip() - name = padded[col_starts[1]:col_starts[2]].strip() - status = padded[col_starts[2]:col_starts[3]].strip() - time_val = padded[col_starts[3]:col_starts[4]].strip() - size = padded[col_starts[4]:col_starts[5]].strip() - filename = padded[col_starts[5]:].strip() - - if vmid and vmid.isdigit(): - vms.append({ - 'vmid': vmid, - 'name': name, - 'status': status, - 'time': time_val, - 'size': size, - 'filename': filename, - }) - - # ── Strategy 2: log-style (PBS / Proxmox Backup Server) ── - # Parse from the full vzdump log lines. - # Look for patterns: - # "Starting Backup of VM NNN (lxc/qemu)" -> detect guest - # "CT Name: xxx" or "VM Name: xxx" -> guest name - # "Finished Backup of VM NNN (HH:MM:SS)" -> duration + status=ok - # "root.pxar: had to backup X of Y" -> size (CT) - # "transferred X in N seconds" -> size (QEMU) - # "creating ... archive 'ct/100/2026-..'" -> archive name for PBS - # "TASK ERROR:" or "ERROR:" -> status=error - if not vms: - current_vm: Optional[Dict[str, str]] = None - - for line in lines: - # Remove "INFO: " prefix that PVE adds - clean = re.sub(r'^(?:INFO|WARNING|ERROR):\s*', '', line.strip()) - - # Start of a new VM backup - m_start = re.match( - r'Starting Backup of VM (\d+)\s+\((lxc|qemu)\)', clean) - if m_start: - if current_vm: - vms.append(current_vm) - current_vm = { - 'vmid': m_start.group(1), - 'name': '', - 'status': 'ok', - 'time': '', - 'size': '', - 'filename': '', - 'type': m_start.group(2), - } - continue - - if current_vm: - # Guest name - m_name = re.match(r'(?:CT|VM) Name:\s*(.+)', clean) - if m_name: - current_vm['name'] = m_name.group(1).strip() - continue - - # PBS archive path -> extract as filename - m_archive = re.search( - r"creating .+ archive '([^']+)'", clean) - if m_archive: - current_vm['filename'] = m_archive.group(1) - continue - - # Size for containers (pxar) - m_pxar = re.search( - r'root\.pxar:.*?of\s+([\d.]+\s+\S+)', clean) - if m_pxar: - current_vm['size'] = m_pxar.group(1) - continue - - # Size for QEMU (transferred) - m_transfer = re.search( - r'transferred\s+([\d.]+\s+\S+)', clean) - if m_transfer: - current_vm['size'] = m_transfer.group(1) - continue - - # Finished -> duration - m_finish = re.match( - r'Finished Backup of VM (\d+)\s+\(([^)]+)\)', clean) - if m_finish: - current_vm['time'] = m_finish.group(2) - current_vm['status'] = 'ok' - vms.append(current_vm) - current_vm = None - continue - - # Error - if clean.startswith('ERROR:') or clean.startswith('TASK ERROR'): - if current_vm: - current_vm['status'] = 'error' - - # Don't forget the last VM if it wasn't finished - if current_vm: - vms.append(current_vm) - - # ── Extract totals ── - for line in lines: - m_time = re.search(r'Total running time:\s*(.+)', line) - if m_time: - total_time = m_time.group(1).strip() - m_size = re.search(r'Total size:\s*(.+)', line) - if m_size: - total_size = m_size.group(1).strip() - - # For PBS: calculate total size if not explicitly stated - if not total_size and vms: - # Sum individual sizes if they share units - sizes_gib = 0.0 - for vm in vms: - s = vm.get('size', '') - m = re.match(r'([\d.]+)\s+(.*)', s) - if m: - val = float(m.group(1)) - unit = m.group(2).strip().upper() - if 'GIB' in unit or 'GB' in unit: - sizes_gib += val - elif 'MIB' in unit or 'MB' in unit: - sizes_gib += val / 1024 - elif 'TIB' in unit or 'TB' in unit: - sizes_gib += val * 1024 - if sizes_gib > 0: - if sizes_gib >= 1024: - total_size = f"{sizes_gib / 1024:.3f} TiB" - elif sizes_gib >= 1: - total_size = f"{sizes_gib:.3f} GiB" - else: - total_size = f"{sizes_gib * 1024:.3f} MiB" - - # For PBS: calculate total time if not stated - if not total_time and vms: - total_secs = 0 - for vm in vms: - t = vm.get('time', '') - # Parse HH:MM:SS format - m = re.match(r'(\d+):(\d+):(\d+)', t) - if m: - total_secs += int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3)) - if total_secs > 0: - hours = total_secs // 3600 - mins = (total_secs % 3600) // 60 - secs = total_secs % 60 - if hours: - total_time = f"{hours}h {mins}m {secs}s" - elif mins: - total_time = f"{mins}m {secs}s" - else: - total_time = f"{secs}s" - - if not vms and not total_size: - return None - - return { - 'vms': vms, - 'total_time': total_time, - 'total_size': total_size, - 'vm_count': len(vms), - } - - -def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str: - """Format parsed vzdump data into a clean Telegram-friendly message.""" - parts = [] - - for vm in parsed.get('vms', []): - status = vm.get('status', '').lower() - icon = '\u2705' if status == 'ok' else '\u274C' - - parts.append(f"{icon} ID {vm['vmid']} ({vm['name']})") - - details = [] - if vm.get('size'): - details.append(f"Size: {vm['size']}") - if vm.get('time'): - details.append(f"Duration: {vm['time']}") - if vm.get('filename'): - fname = vm['filename'] - # PBS archives look like "ct/100/2026-..." or "vm/105/2026-..." - if re.match(r'^(?:ct|vm)/\d+/', fname): - details.append(f"PBS: {fname}") - else: - details.append(f"File: {fname}") - if details: - parts.append(' | '.join(details)) - parts.append('') # blank line between VMs - - # Summary - vm_count = parsed.get('vm_count', 0) - if vm_count > 0 or parsed.get('total_size'): - ok_count = sum(1 for v in parsed.get('vms', []) - if v.get('status', '').lower() == 'ok') - fail_count = vm_count - ok_count - - summary_parts = [] - if vm_count: - summary_parts.append(f"{vm_count} backup(s)") - if fail_count: - summary_parts.append(f"{fail_count} failed") - if parsed.get('total_size'): - summary_parts.append(f"Total: {parsed['total_size']}") - if parsed.get('total_time'): - summary_parts.append(f"Time: {parsed['total_time']}") - - if summary_parts: - parts.append('--- ' + ' | '.join(summary_parts)) - - return '\n'.join(parts) - - -# ─── Severity Icons ────────────────────────────────────────────── - -SEVERITY_ICONS = { - 'CRITICAL': '\U0001F534', - 'WARNING': '\U0001F7E1', - 'INFO': '\U0001F535', - 'OK': '\U0001F7E2', - 'UNKNOWN': '\u26AA', -} - -SEVERITY_ICONS_DISCORD = { - 'CRITICAL': ':red_circle:', - 'WARNING': ':yellow_circle:', - 'INFO': ':blue_circle:', - 'OK': ':green_circle:', - 'UNKNOWN': ':white_circle:', -} - - -# ─── Event Templates ───────────────────────────────────────────── -# Each template has a 'title' and 'body' with {variable} placeholders. -# 'group' is used for UI event filter grouping. -# 'default_enabled' controls initial state in settings. - -TEMPLATES = { - # ── Health Monitor state changes ── - # NOTE: state_change is disabled by default -- it fires on every - # status oscillation (OK->WARNING->OK) which creates noise. - # The health_persistent and new_error templates cover this better. - 'state_change': { - 'title': '{hostname}: {category} changed to {current}', - 'body': '{category} status changed from {previous} to {current}.\n{reason}', - 'group': 'system', - 'default_enabled': False, - }, - 'new_error': { - 'title': '{hostname}: New {severity} - {category}', - 'body': '{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'error_resolved': { - 'title': '{hostname}: Resolved - {category}', - 'body': '{reason}\nDuration: {duration}', - 'group': 'system', - 'default_enabled': True, - }, - 'error_escalated': { - 'title': '{hostname}: Escalated to {severity} - {category}', - 'body': '{reason}', - 'group': 'system', - 'default_enabled': True, - }, - - # ── VM / CT events ── - 'vm_start': { - 'title': '{hostname}: VM {vmid} started', - 'body': '{vmname} ({vmid}) has been started.', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'vm_stop': { - 'title': '{hostname}: VM {vmid} stopped', - 'body': '{vmname} ({vmid}) has been stopped.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'vm_shutdown': { - 'title': '{hostname}: VM {vmid} shutdown', - 'body': '{vmname} ({vmid}) has been shut down.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'vm_fail': { - 'title': '{hostname}: VM {vmid} FAILED', - 'body': '{vmname} ({vmid}) has failed.\n{reason}', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'vm_restart': { - 'title': '{hostname}: VM {vmid} restarted', - 'body': '{vmname} ({vmid}) has been restarted.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'ct_start': { - 'title': '{hostname}: CT {vmid} started', - 'body': '{vmname} ({vmid}) has been started.', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'ct_stop': { - 'title': '{hostname}: CT {vmid} stopped', - 'body': '{vmname} ({vmid}) has been stopped.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'ct_shutdown': { - 'title': '{hostname}: CT {vmid} shutdown', - 'body': '{vmname} ({vmid}) has been shut down.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'ct_restart': { - 'title': '{hostname}: CT {vmid} restarted', - 'body': '{vmname} ({vmid}) has been restarted.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - 'ct_fail': { - 'title': '{hostname}: CT {vmid} FAILED', - 'body': '{vmname} ({vmid}) has failed.\n{reason}', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'migration_start': { - 'title': '{hostname}: Migration started - {vmid}', - 'body': '{vmname} ({vmid}) migration to {target_node} started.', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'migration_complete': { - 'title': '{hostname}: Migration complete - {vmid}', - 'body': '{vmname} ({vmid}) migrated successfully to {target_node}.', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'migration_fail': { - 'title': '{hostname}: Migration FAILED - {vmid}', - 'body': '{vmname} ({vmid}) migration to {target_node} failed.\n{reason}', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'replication_fail': { - 'title': '{hostname}: Replication FAILED - {vmid}', - 'body': 'Replication of {vmname} ({vmid}) has failed.\n{reason}', - 'group': 'vm_ct', - 'default_enabled': True, - }, - 'replication_complete': { - 'title': '{hostname}: Replication complete - {vmid}', - 'body': 'Replication of {vmname} ({vmid}) completed successfully.', - 'group': 'vm_ct', - 'default_enabled': False, - }, - - # ── Backup / Snapshot events ── - 'backup_start': { - 'title': '{hostname}: Backup started - {vmid}', - 'body': 'Backup of {vmname} ({vmid}) has started.', - 'group': 'backup', - 'default_enabled': False, - }, - 'backup_complete': { - 'title': '{hostname}: Backup complete - {vmid}', - 'body': 'Backup of {vmname} ({vmid}) completed successfully.\nSize: {size}', - 'group': 'backup', - 'default_enabled': True, - }, - 'backup_fail': { - 'title': '{hostname}: Backup FAILED - {vmid}', - 'body': 'Backup of {vmname} ({vmid}) has failed.\n{reason}', - 'group': 'backup', - 'default_enabled': True, - }, - 'snapshot_complete': { - 'title': '{hostname}: Snapshot created - {vmid}', - 'body': 'Snapshot of {vmname} ({vmid}) created: {snapshot_name}', - 'group': 'backup', - 'default_enabled': False, - }, - 'snapshot_fail': { - 'title': '{hostname}: Snapshot FAILED - {vmid}', - 'body': 'Snapshot of {vmname} ({vmid}) failed.\n{reason}', - 'group': 'backup', - 'default_enabled': True, - }, - - # ── Resource events (from Health Monitor) ── - 'cpu_high': { - 'title': '{hostname}: High CPU usage ({value}%)', - 'body': 'CPU usage is at {value}% on {cores} cores.\n{details}', - 'group': 'resources', - 'default_enabled': True, - }, - 'ram_high': { - 'title': '{hostname}: High memory usage ({value}%)', - 'body': 'Memory usage: {used} / {total} ({value}%).\n{details}', - 'group': 'resources', - 'default_enabled': True, - }, - 'temp_high': { - 'title': '{hostname}: High temperature ({value}C)', - 'body': 'CPU temperature: {value}C (threshold: {threshold}C).\n{details}', - 'group': 'resources', - 'default_enabled': True, - }, - 'disk_space_low': { - 'title': '{hostname}: Low disk space on {mount}', - 'body': '{mount}: {used}% used ({available} available).', - 'group': 'storage', - 'default_enabled': True, - }, - 'disk_io_error': { - 'title': '{hostname}: Disk I/O error', - 'body': '{reason}', - 'group': 'storage', - 'default_enabled': True, - }, - 'storage_unavailable': { - 'title': '{hostname}: Storage unavailable - {storage_name}', - 'body': 'PVE storage "{storage_name}" ({storage_type}) is not available.\n{reason}', - 'group': 'storage', - 'default_enabled': True, - }, - 'load_high': { - 'title': '{hostname}: High system load ({value})', - 'body': 'System load average: {value} on {cores} cores.\n{details}', - 'group': 'resources', - 'default_enabled': True, - }, - - # ── Network events ── - 'network_down': { - 'title': '{hostname}: Network connectivity lost', - 'body': 'Network connectivity check failed.\n{reason}', - 'group': 'network', - 'default_enabled': True, - }, - 'network_latency': { - 'title': '{hostname}: High network latency ({value}ms)', - 'body': 'Latency to gateway: {value}ms (threshold: {threshold}ms).', - 'group': 'network', - 'default_enabled': False, - }, - - # ── Security events ── - 'auth_fail': { - 'title': '{hostname}: Authentication failure', - 'body': 'Failed login attempt from {source_ip}.\nUser: {username}\nService: {service}', - 'group': 'security', - 'default_enabled': True, - }, - 'ip_block': { - 'title': '{hostname}: IP blocked by Fail2Ban', - 'body': 'IP {source_ip} has been banned.\nJail: {jail}\nFailures: {failures}', - 'group': 'security', - 'default_enabled': True, - }, - 'firewall_issue': { - 'title': '{hostname}: Firewall issue detected', - 'body': '{reason}', - 'group': 'security', - 'default_enabled': True, - }, - 'user_permission_change': { - 'title': '{hostname}: User permission changed', - 'body': 'User: {username}\nChange: {change_details}', - 'group': 'security', - 'default_enabled': True, - }, - - # ── Cluster events ── - 'split_brain': { - 'title': '{hostname}: SPLIT-BRAIN detected', - 'body': 'Cluster split-brain condition detected.\nQuorum status: {quorum}', - 'group': 'cluster', - 'default_enabled': True, - }, - 'node_disconnect': { - 'title': '{hostname}: Node disconnected', - 'body': 'Node {node_name} has disconnected from the cluster.', - 'group': 'cluster', - 'default_enabled': True, - }, - 'node_reconnect': { - 'title': '{hostname}: Node reconnected', - 'body': 'Node {node_name} has reconnected to the cluster.', - 'group': 'cluster', - 'default_enabled': True, - }, - - # ── System events ── - 'system_shutdown': { - 'title': '{hostname}: System shutting down', - 'body': 'The system is shutting down.\n{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'system_reboot': { - 'title': '{hostname}: System rebooting', - 'body': 'The system is rebooting.\n{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'system_problem': { - 'title': '{hostname}: System problem detected', - 'body': '{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'service_fail': { - 'title': '{hostname}: Service failed - {service_name}', - 'body': '{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'update_available': { - 'title': '{hostname}: Updates available ({count})', - 'body': '{count} package updates are available.\n{details}', - 'group': 'system', - 'default_enabled': False, - }, - 'update_complete': { - 'title': '{hostname}: Update completed', - 'body': '{details}', - 'group': 'system', - 'default_enabled': False, - }, - - # ── Unknown persistent (from health monitor) ── - 'unknown_persistent': { - 'title': '{hostname}: Check unavailable - {category}', - 'body': 'Health check for {category} has been unavailable for 3+ cycles.\n{reason}', - 'group': 'system', - 'default_enabled': False, - }, - - # ── Persistent Health Issues (daily digest) ── - 'health_persistent': { - 'title': '{hostname}: {count} active health issue(s)', - 'body': 'The following health issues remain active:\n{issue_list}\n\nThis digest is sent once every 24 hours while issues persist.', - 'group': 'system', - 'default_enabled': True, - }, - 'health_issue_new': { - 'title': '{hostname}: New health issue - {category}', - 'body': 'New {severity} issue detected:\n{reason}', - 'group': 'system', - 'default_enabled': True, - }, - 'health_issue_resolved': { - 'title': '{hostname}: Resolved - {category}', - 'body': '{category} issue has been resolved.\n{reason}\nDuration: {duration}', - 'group': 'system', - 'default_enabled': True, - }, - - # ── Update notifications (enriched) ── - 'update_summary': { - 'title': '{hostname}: {total_count} updates available', - 'body': '{security_count} security update(s), {total_count} total.\n{package_list}', - 'group': 'system', - 'default_enabled': True, - }, - 'pve_update': { - 'title': '{hostname}: PVE update available ({version})', - 'body': 'Proxmox VE update available: {version}\n{details}', - 'group': 'system', - 'default_enabled': True, - }, - - # ── PVE webhook test ── - 'webhook_test': { - 'title': '{hostname}: Webhook test received', - 'body': 'PVE webhook connectivity test successful.\n{reason}', - 'group': 'system', - 'default_enabled': True, - }, - - # ── Burst aggregation summaries ── - 'burst_auth_fail': { - 'title': '{hostname}: {count} auth failures in {window}', - 'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}', - 'group': 'security', - 'default_enabled': True, - }, - 'burst_ip_block': { - 'title': '{hostname}: Fail2Ban banned {count} IPs in {window}', - 'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}', - 'group': 'security', - 'default_enabled': True, - }, - 'burst_disk_io': { - 'title': '{hostname}: {count} disk I/O errors on {entity_list}', - 'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}', - 'group': 'storage', - 'default_enabled': True, - }, - 'burst_cluster': { - 'title': '{hostname}: Cluster flapping detected ({count} changes)', - 'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}', - 'group': 'cluster', - 'default_enabled': True, - }, - 'burst_generic': { - 'title': '{hostname}: {count} {event_type} events in {window}', - 'body': '{count} events of type {event_type} in {window}.\n{entity_list}', - 'group': 'system', - 'default_enabled': True, - }, -} - -# ─── Event Groups (for UI filtering) ───────────────────────────── - -EVENT_GROUPS = { - 'system': {'label': 'System', 'description': 'System health, services, updates'}, - 'vm_ct': {'label': 'VM / CT', 'description': 'Virtual machines and containers'}, - 'backup': {'label': 'Backup', 'description': 'Backups and snapshots'}, - 'resources': {'label': 'Resources', 'description': 'CPU, memory, temperature, load'}, - 'storage': {'label': 'Storage', 'description': 'Disk space and I/O'}, - 'network': {'label': 'Network', 'description': 'Connectivity and latency'}, - 'security': {'label': 'Security', 'description': 'Authentication, firewall, bans'}, - 'cluster': {'label': 'Cluster', 'description': 'Cluster health and quorum'}, -} - - -# ─── Template Renderer ─────────────────────────────────────────── - -def _get_hostname() -> str: - """Get short hostname for message titles.""" - try: - return socket.gethostname().split('.')[0] - except Exception: - return 'proxmox' - - -def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]: - """Render a template into a structured notification object. - - Returns structured output usable by all channels: - title, body (text), body_text, body_html (escaped), fields, tags, severity, group - """ - import html as html_mod - - template = TEMPLATES.get(event_type) - if not template: - fallback_body = data.get('message', data.get('reason', str(data))) - severity = data.get('severity', 'INFO') - return { - 'title': f"{_get_hostname()}: {event_type}", - 'body': fallback_body, 'body_text': fallback_body, - 'body_html': f'

{html_mod.escape(str(fallback_body))}

', - 'fields': [], 'tags': [severity, 'system', event_type], - 'severity': severity, 'group': 'system', - } - - # Ensure hostname is always available - variables = { - 'hostname': _get_hostname(), - 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), - 'severity': data.get('severity', 'INFO'), - # Burst event variables - 'window': '', 'entity_list': '', - # Common defaults - 'vmid': '', 'vmname': '', 'reason': '', 'summary': '', - 'details': '', 'category': '', 'previous': '', 'current': '', - 'duration': '', 'value': '', 'threshold': '', - 'source_ip': '', 'username': '', 'service': '', 'service_name': '', - 'node_name': '', 'target_node': '', 'mount': '', 'device': '', - 'used': '', 'total': '', 'available': '', 'cores': '', - 'count': '', 'size': '', 'snapshot_name': '', 'jail': '', - 'failures': '', 'quorum': '', 'change_details': '', 'message': '', - 'security_count': '0', 'total_count': '0', 'package_list': '', - 'packages': '', 'pve_packages': '', 'version': '', - 'issue_list': '', 'error_key': '', - 'storage_name': '', 'storage_type': '', - } - variables.update(data) - - try: - title = template['title'].format(**variables) - except (KeyError, ValueError): - title = template['title'] - - # ── PVE vzdump special formatting ── - # When the event came from PVE webhook with a full vzdump message, - # parse the table/logs and format a rich body instead of the sparse template. - pve_message = data.get('pve_message', '') - pve_title = data.get('pve_title', '') - - if event_type in ('backup_complete', 'backup_fail') and pve_message: - parsed = _parse_vzdump_message(pve_message) - if parsed: - is_success = (event_type == 'backup_complete') - body_text = _format_vzdump_body(parsed, is_success) - # Use PVE's own title if available (contains hostname and status) - if pve_title: - title = pve_title - else: - # Couldn't parse -- use PVE raw message as body - body_text = pve_message.strip() - elif event_type == 'system_mail' and pve_message: - # System mail -- use PVE message directly (mail bounce, cron, smartd) - body_text = pve_message.strip()[:1000] - else: - try: - body_text = template['body'].format(**variables) - except (KeyError, ValueError): - body_text = template['body'] - - # Clean up: collapse runs of 3+ blank lines into 1, remove trailing whitespace - import re as _re - body_text = _re.sub(r'\n{3,}', '\n\n', body_text.strip()) - - severity = variables.get('severity', 'INFO') - group = template.get('group', 'system') - - # Build structured fields for Discord embeds / rich notifications - fields = [] - field_map = [ - ('vmid', 'VM/CT'), ('vmname', 'Name'), ('device', 'Device'), - ('source_ip', 'Source IP'), ('node_name', 'Node'), ('category', 'Category'), - ('service_name', 'Service'), ('jail', 'Jail'), ('username', 'User'), - ('count', 'Count'), ('window', 'Window'), ('entity_list', 'Affected'), - ] - for key, label in field_map: - val = variables.get(key, '') - if val: - fields.append((label, str(val))) - - # Build HTML body with escaped content - body_html_parts = [] - for line in body_text.split('\n'): - if line.strip(): - body_html_parts.append(f'

{html_mod.escape(line)}

') - body_html = '\n'.join(body_html_parts) if body_html_parts else f'

{html_mod.escape(body_text)}

' - - return { - 'title': title, - 'body': body_text, # backward compat - 'body_text': body_text, - 'body_html': body_html, - 'fields': fields, - 'tags': [severity, group, event_type], - 'severity': severity, - 'group': group, - } - - -def get_event_types_by_group() -> Dict[str, list]: - """Get all event types organized by group, for UI rendering. - - Returns: - {group_key: [{'type': event_type, 'title': template_title, - 'default_enabled': bool}, ...]} - """ - result = {} - for event_type, template in TEMPLATES.items(): - group = template.get('group', 'system') - if group not in result: - result[group] = [] - import re - # Clean title: remove {hostname}: prefix and any remaining {placeholders} - title = template['title'].replace('{hostname}', '').strip(': ') - title = re.sub(r'\s*\{[^}]+\}', '', title).strip(' -:') - if not title: - title = event_type.replace('_', ' ').title() - result[group].append({ - 'type': event_type, - 'title': title, - 'default_enabled': template.get('default_enabled', True), - }) - return result - - -def get_default_enabled_events() -> Dict[str, bool]: - """Get the default enabled state for all event types.""" - return { - event_type: template.get('default_enabled', True) - for event_type, template in TEMPLATES.items() - } - - -# ─── AI Enhancement (Optional) ─────────────────────────────────── - -class AIEnhancer: - """Optional AI message enhancement using external LLM API. - - Enriches template-generated messages with context and suggestions. - Falls back to original message if AI is unavailable or fails. - """ - - SYSTEM_PROMPT = """You are a Proxmox system administrator assistant. -You receive a notification message about a server event and must enhance it with: -1. A brief explanation of what this means in practical terms -2. A suggested action if applicable (1-2 sentences max) - -Keep the response concise (max 3 sentences total). Do not repeat the original message. -Respond in the same language as the input message.""" - - def __init__(self, provider: str, api_key: str, model: str = ''): - self.provider = provider.lower() - self.api_key = api_key - self.model = model - self._enabled = bool(api_key) - - @property - def enabled(self) -> bool: - return self._enabled - - def enhance(self, title: str, body: str, severity: str) -> Optional[str]: - """Enhance a notification message with AI context. - - Returns enhanced body text, or None if enhancement fails/disabled. - """ - if not self._enabled: - return None - - try: - if self.provider in ('openai', 'groq'): - return self._call_openai_compatible(title, body, severity) - except Exception as e: - print(f"[AIEnhancer] Enhancement failed: {e}") - - return None - - def _call_openai_compatible(self, title: str, body: str, severity: str) -> Optional[str]: - """Call OpenAI-compatible API (works with OpenAI, Groq, local).""" - if self.provider == 'groq': - url = 'https://api.groq.com/openai/v1/chat/completions' - model = self.model or 'llama-3.3-70b-versatile' - else: # openai - url = 'https://api.openai.com/v1/chat/completions' - model = self.model or 'gpt-4o-mini' - - user_msg = f"Severity: {severity}\nTitle: {title}\nMessage: {body}" - - payload = json.dumps({ - 'model': model, - 'messages': [ - {'role': 'system', 'content': self.SYSTEM_PROMPT}, - {'role': 'user', 'content': user_msg}, - ], - 'max_tokens': 150, - 'temperature': 0.3, - }).encode('utf-8') - - headers = { - 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}', - } - - req = urllib.request.Request(url, data=payload, headers=headers) - with urllib.request.urlopen(req, timeout=10) as resp: - result = json.loads(resp.read().decode('utf-8')) - content = result['choices'][0]['message']['content'].strip() - return content if content else None - - -def format_with_ai(title: str, body: str, severity: str, - ai_config: Dict[str, str]) -> str: - """Format a message with optional AI enhancement. - - If AI is configured and succeeds, appends AI insight to the body. - Otherwise returns the original body unchanged. - - Args: - title: Notification title - body: Notification body - severity: Severity level - ai_config: {'enabled': 'true', 'provider': 'groq', 'api_key': '...', 'model': ''} - - Returns: - Enhanced body string - """ - if ai_config.get('enabled') != 'true' or not ai_config.get('api_key'): - return body - - enhancer = AIEnhancer( - provider=ai_config.get('provider', 'groq'), - api_key=ai_config['api_key'], - model=ai_config.get('model', ''), - ) - - insight = enhancer.enhance(title, body, severity) - if insight: - return f"{body}\n\n---\n{insight}" - - return body