Files
ProxMenux/AppImage/scripts/notification_manager.py
T
2026-05-09 18:59:59 +02:00

2726 lines
122 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ProxMenux Notification Manager
Central orchestrator for the notification service.
Connects:
- notification_channels.py (transport: Telegram, Gotify, Discord)
- notification_templates.py (message formatting + optional AI)
- notification_events.py (event detection: Journal, Task, Polling watchers)
- health_persistence.py (DB: config storage, notification_history)
Two interfaces consume this module:
1. Server mode: Flask imports and calls start()/stop()/send_notification()
2. CLI mode: `python3 notification_manager.py --action send --type vm_fail ...`
Scripts .sh in /usr/local/share/proxmenux/scripts call this directly.
Author: MacRimi
"""
import concurrent.futures
import ipaddress
import json
import os
import sys
import time
import socket
import sqlite3
import threading
import base64
from queue import Queue, Empty
from datetime import datetime
from typing import Dict, Any, List, Optional
from pathlib import Path
from urllib.parse import urlparse
# Ensure local imports work
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
if BASE_DIR not in sys.path:
sys.path.insert(0, BASE_DIR)
from notification_channels import create_channel, CHANNEL_TYPES
from notification_templates import (
render_template, format_with_ai, format_with_ai_full, enrich_with_emojis, TEMPLATES,
EVENT_GROUPS, get_event_types_by_group, get_default_enabled_events
)
from notification_events import (
JournalWatcher, TaskWatcher, PollingCollector, NotificationEvent,
ProxmoxHookWatcher,
)
# AI context enrichment (uptime, frequency, SMART data, known errors)
try:
    from ai_context_enrichment import enrich_context_for_ai
except ImportError:
    # Fallback stub used when ai_context_enrichment is not bundled: accepts
    # the same signature and simply passes the journal context through, so
    # callers can invoke it unconditionally without feature checks.
    def enrich_context_for_ai(title, body, event_type, data, journal_context='', detail_level='standard'):
        return journal_context
# ─── Constants ────────────────────────────────────────────────────
DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db')
SETTINGS_PREFIX = 'notification.'
ENCRYPTION_KEY_FILE = Path('/usr/local/share/proxmenux/.notification_key')
# Keys that contain sensitive data and should be encrypted
SENSITIVE_KEYS = {
'ai_api_key', # Legacy - kept for migration
'ai_api_key_groq',
'ai_api_key_gemini',
'ai_api_key_anthropic',
'ai_api_key_openai',
'ai_api_key_openrouter',
'telegram.token',
'gotify.token',
'discord.webhook_url',
'email.password',
'webhook_secret',
}
# Sentinel string sent to the UI in place of any populated sensitive value.
# When the UI POSTs settings back, fields whose value equals the placeholder
# are treated as "unchanged" and skipped — avoiding the bug where the user
# loaded the page (got placeholders), saved an unrelated field, and we wrote
# the placeholder over the real key. See audit Tier 2 #17c.
SENSITIVE_PLACEHOLDER = '************'
def _mask_if_set(value):
"""Return the placeholder when `value` looks populated, else empty string."""
if value is None:
return ''
s = str(value)
if not s:
return ''
return SENSITIVE_PLACEHOLDER
# ─── SSRF guard for user-supplied URLs (audit Tier 3 #19) ───────────────
#
# Used by both the REST routes (test-ai, provider-models) and `save_settings`
# below — so that storing a malicious `ai_ollama_url` or `ai_openai_base_url`
# is rejected at the boundary, not just when the user clicks "Test". The
# AI providers themselves consume these URLs in the dispatch path; if we
# only validated at test time, an attacker could persist `file://...` and
# it would fire every notification later.
_SSRF_BLOCKED_HOSTS = frozenset({
    'metadata.google.internal',
    'metadata.azure.com',
    'metadata.aws.amazon.com',
    'instance-data',
})

def validate_external_url(url, *, allow_loopback=False, max_len=512):
    """Return (ok: bool, error_msg: str). Refuses obviously-dangerous URLs.

    Checks, in order: presence/type, length (`max_len`), scheme, hostname,
    cloud-metadata host blocklist, then IP-class restrictions. Literal IPs in
    link-local / multicast / reserved / unspecified ranges are always refused;
    loopback and private (RFC1918) addresses — and well-known loopback
    hostnames — are refused unless `allow_loopback=True`.

    `allow_loopback=True` is appropriate for local services like Ollama where
    the documented default is `http://localhost:11434`. For any URL pointing
    at a third-party API (OpenAI, Anthropic, ...) keep `allow_loopback=False`.
    """
    if not url or not isinstance(url, str):
        return False, "URL is required"
    if len(url) > max_len:
        # Bug fix: the previous message was an f-string with no placeholder
        # that hard-coded "68,750 characters" while max_len defaults to 512.
        # Report the actual configured limit.
        return False, f"URL exceeds {max_len} characters"
    try:
        parsed = urlparse(url)
    except ValueError:
        return False, "URL is malformed"
    if parsed.scheme not in ('http', 'https'):
        return False, "Only http:// and https:// are accepted"
    host = (parsed.hostname or '').lower()
    if not host:
        return False, "URL is missing a hostname"
    if host in _SSRF_BLOCKED_HOSTS:
        return False, "Hostname is blocked (cloud metadata service)"
    # Distinguish literal IPs from hostnames: IPs get class-based checks,
    # hostnames only the loopback-name check below.
    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        ip = None
    if ip is not None:
        if ip.is_link_local:
            return False, "Link-local addresses are not allowed"
        if ip.is_multicast or ip.is_reserved or ip.is_unspecified:
            return False, "Reserved IP address class is not allowed"
        if not allow_loopback and ip.is_loopback:
            return False, "Loopback addresses are not allowed"
        if not allow_loopback and ip.is_private:
            return False, "Private (RFC1918) addresses are not allowed"
    elif not allow_loopback and host in ('localhost', 'localhost.localdomain', 'ip6-localhost'):
        return False, "Loopback hostnames are not allowed"
    return True, ""
# ─── Hostname Resolution ─────────────────────────────────────────
def _resolve_display_hostname(config: Optional[Dict[str, str]] = None) -> str:
"""Resolve the hostname to use in notification titles/data.
Order of precedence:
1. The user-configured "Display Name" in notification settings (`hostname` key).
2. The system FQDN from `socket.gethostname()` — kept whole, NOT truncated at the
first dot. Multi-node deployments need the full FQDN to disambiguate hosts.
If `config` is None, the value is read from the SQLite settings table directly so
callers without a NotificationManager instance (e.g. EmailChannel.test) can use it.
"""
configured = ''
if config is not None:
configured = (config.get('hostname') or '').strip()
else:
try:
if DB_PATH.exists():
conn = sqlite3.connect(str(DB_PATH), timeout=5)
conn.execute('PRAGMA busy_timeout=2000')
row = conn.execute(
'SELECT setting_value FROM user_settings WHERE setting_key = ?',
(f'{SETTINGS_PREFIX}hostname',),
).fetchone()
conn.close()
if row and row[0]:
configured = str(row[0]).strip()
except Exception:
# If the DB is locked / missing / corrupt, silently fall through to FQDN.
configured = ''
if configured:
return configured
return socket.gethostname()
# ─── Encryption for Sensitive Data ───────────────────────────────
#
# Audit Tier 4 #24 flagged the previous implementation as trivially reversible:
# fixed-key XOR with a single global key, no authentication, no salting. Any
# attacker who got the DB plus `.notification_key` could decrypt every secret
# in seconds — and worse, two values encrypted under the same key let an
# attacker XOR their ciphertexts to recover (plaintext_a XOR plaintext_b),
# which often discloses both.
#
# We replace that with a stdlib-only authenticated-encryption scheme based on
# encrypt-then-MAC primitives we DO have (`hashlib.pbkdf2_hmac`, `hmac.new`):
#
# 1. Per-value 16-byte random salt.
# 2. Derive 64 bytes from the master key via PBKDF2-HMAC-SHA256 (50k iters).
# First 32B = stream key, last 32B = MAC key.
# 3. Build a keystream by HMAC-SHA256 in counter mode over the stream key
# and XOR it against the plaintext.
# 4. MAC := HMAC-SHA256(mac_key, salt || ciphertext).
# 5. Persist as `ENC2:<salt_b64>:<ciphertext_b64>:<mac_b64>`.
#
# Decryption verifies the MAC FIRST in constant time (hmac.compare_digest)
# and bails on mismatch, so tampered ciphertexts never produce output.
#
# Cross-value security: each value uses a fresh salt → different stream key,
# so XOR-pair recovery is closed. `cryptography` (Fernet/AES-GCM) is not
# bundled in the AppImage build today (see build_appimage.sh comment about
# PyO3 issues); when it is added, this scheme should be replaced — but the
# current implementation is light-years better than what it replaces.
# Old format prefix kept for read compatibility (lazy migration on save).
_LEGACY_ENC_PREFIX = "ENC:"
_NEW_ENC_PREFIX = "ENC2:"
_ENC_KDF_ITERS = 50000 # 50k is plenty given the master key is already random
_ENC_SALT_BYTES = 16
def _get_or_create_encryption_key() -> bytes:
    """Return the 32-byte master key, minting it atomically on first use."""
    if ENCRYPTION_KEY_FILE.exists():
        return ENCRYPTION_KEY_FILE.read_bytes()
    # First run: generate a fresh random key. os.open with O_CREAT|O_EXCL and
    # mode 0o600 creates the file atomically with restrictive permissions,
    # closing the TOCTOU window the previous open()+chmod sequence left (the
    # file briefly sat at umask-default 0o644 between create and chmod).
    # Audit Tier 6.
    import secrets as _secrets
    fresh_key = _secrets.token_bytes(32)
    ENCRYPTION_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
    try:
        fd = os.open(
            str(ENCRYPTION_KEY_FILE),
            os.O_WRONLY | os.O_CREAT | os.O_EXCL,
            0o600,
        )
    except FileExistsError:
        # Another worker won the creation race — load the key they wrote.
        return ENCRYPTION_KEY_FILE.read_bytes()
    try:
        os.write(fd, fresh_key)
    finally:
        os.close(fd)
    return fresh_key
def _derive_subkeys(master: bytes, salt: bytes) -> tuple:
    """Derive (stream_key, mac_key) from master+salt via PBKDF2-HMAC-SHA256.

    64 bytes of key material are stretched from the master key; the first
    half feeds the keystream, the second half the MAC.
    """
    import hashlib as _hashlib
    material = _hashlib.pbkdf2_hmac('sha256', master, salt, _ENC_KDF_ITERS, dklen=64)
    return material[:32], material[32:]
def _hmac_keystream(stream_key: bytes, length: int) -> bytes:
"""Produce `length` bytes of keystream via HMAC-SHA256 in counter mode."""
import hmac as _hmac
import hashlib as _hashlib
out = bytearray()
counter = 0
while len(out) < length:
block = _hmac.new(stream_key, counter.to_bytes(8, 'big'), _hashlib.sha256).digest()
out.extend(block)
counter += 1
return bytes(out[:length])
def encrypt_sensitive_value(value: str) -> str:
    """Authenticated-encrypt a sensitive value.

    Output: `ENC2:<salt_b64>:<ciphertext_b64>:<mac_b64>`. Idempotent: a
    string already carrying the ENC: or ENC2: prefix — or a falsy value —
    is returned untouched.
    """
    if not value:
        return value
    if value.startswith((_NEW_ENC_PREFIX, _LEGACY_ENC_PREFIX)):
        return value
    import hmac as _hmac
    import hashlib as _hashlib
    import secrets as _secrets
    # Fresh per-value salt → distinct stream key per value, so two
    # ciphertexts can never be XORed together to leak plaintext.
    salt = _secrets.token_bytes(_ENC_SALT_BYTES)
    stream_key, mac_key = _derive_subkeys(_get_or_create_encryption_key(), salt)
    raw = value.encode('utf-8')
    pad = _hmac_keystream(stream_key, len(raw))
    ciphertext = bytes(b ^ k for b, k in zip(raw, pad))
    # Encrypt-then-MAC over salt || ciphertext.
    mac = _hmac.new(mac_key, salt + ciphertext, _hashlib.sha256).digest()
    salt_b64, ct_b64, mac_b64 = (
        base64.b64encode(part).decode('ascii') for part in (salt, ciphertext, mac)
    )
    return f"{_NEW_ENC_PREFIX}{salt_b64}:{ct_b64}:{mac_b64}"
def _decrypt_legacy_xor(encrypted_b64: str) -> str:
    """Decrypt the old fixed-key XOR format (one-time read during migration)."""
    key = _get_or_create_encryption_key()
    raw = base64.b64decode(encrypted_b64)
    key_len = len(key)
    # Repeating-key XOR against the master key — the scheme ENC2 replaced.
    plain = bytes(byte ^ key[idx % key_len] for idx, byte in enumerate(raw))
    return plain.decode('utf-8')
def decrypt_sensitive_value(encrypted: str) -> str:
    """Decrypt a sensitive value. Recognizes both ENC2 (new) and ENC (legacy).

    ENC2 values are MAC-verified in constant time BEFORE any keystream is
    derived, so tampered or malformed ciphertexts yield '' rather than
    garbage. Values with no recognized prefix are returned unchanged
    (cleartext stored before encryption was introduced).
    """
    if not encrypted:
        return encrypted
    import hmac as _hmac
    import hashlib as _hashlib
    if encrypted.startswith(_NEW_ENC_PREFIX):
        # Parse the ENC2 envelope: <salt>:<ciphertext>:<mac>, all base64.
        try:
            salt_b64, ct_b64, mac_b64 = encrypted[len(_NEW_ENC_PREFIX):].split(':', 2)
            salt = base64.b64decode(salt_b64)
            ciphertext = base64.b64decode(ct_b64)
            received_mac = base64.b64decode(mac_b64)
        except Exception as e:
            print(f"[NotificationManager] Malformed ENC2 value: {e}")
            return ''
        stream_key, mac_key = _derive_subkeys(_get_or_create_encryption_key(), salt)
        expected_mac = _hmac.new(mac_key, salt + ciphertext, _hashlib.sha256).digest()
        if not _hmac.compare_digest(expected_mac, received_mac):
            print("[NotificationManager] MAC verification failed — refusing to decrypt")
            return ''
        pad = _hmac_keystream(stream_key, len(ciphertext))
        try:
            return bytes(c ^ k for c, k in zip(ciphertext, pad)).decode('utf-8')
        except Exception as e:
            print(f"[NotificationManager] ENC2 decode error: {e}")
            return ''
    if encrypted.startswith(_LEGACY_ENC_PREFIX):
        # Legacy XOR — readable so old values still load; they are
        # re-encrypted into ENC2 on the next save_settings.
        try:
            return _decrypt_legacy_xor(encrypted[len(_LEGACY_ENC_PREFIX):])
        except Exception as e:
            print(f"[NotificationManager] Failed to decrypt legacy ENC: {e}")
            return ''
    # No recognized prefix: caller stored the value cleartext (data from
    # before encryption was introduced). Return as-is.
    return encrypted
# Cooldown defaults (seconds). Project-wide rule: an event with the same
# fingerprint never fires twice within 24h. If the underlying issue
# clears, no further pings; if it persists past the window, the next
# occurrence surfaces normally — exactly once per 24h. The narrow
# overrides further down in `_passes_cooldown` keep events that the user
# wants to see every time (backups completed, VM start/stop, shutdown)
# at their short cooldowns.
DEFAULT_COOLDOWNS = {
'CRITICAL': 86400,
'WARNING': 86400,
'INFO': 86400,
'resources': 86400,
'updates': 86400,
}
# ─── Storm Protection ────────────────────────────────────────────
GROUP_RATE_LIMITS = {
    'security': {'max_per_minute': 5, 'max_per_hour': 30},
    'storage': {'max_per_minute': 3, 'max_per_hour': 20},
    'cluster': {'max_per_minute': 5, 'max_per_hour': 20},
    'network': {'max_per_minute': 3, 'max_per_hour': 15},
    'resources': {'max_per_minute': 3, 'max_per_hour': 20},
    'vm_ct': {'max_per_minute': 10, 'max_per_hour': 60},
    'backup': {'max_per_minute': 5, 'max_per_hour': 30},
    'services': {'max_per_minute': 5, 'max_per_hour': 30},
    'health': {'max_per_minute': 3, 'max_per_hour': 20},
    'updates': {'max_per_minute': 3, 'max_per_hour': 15},
    'other': {'max_per_minute': 5, 'max_per_hour': 30},
}
# Default fallback for unknown groups
_DEFAULT_RATE_LIMIT = {'max_per_minute': 5, 'max_per_hour': 30}


class GroupRateLimiter:
    """Sliding-window rate limiter per event group — storm protection.

    Thread-safe: allow() may be called concurrently from the dispatch
    thread, journal watcher, polling collector, and webhook handler. The
    lock prevents the deque corruption (IndexError, inconsistent counts)
    seen under storm conditions before it existed. Audit Tier 6
    (Notification stack #5).
    """

    def __init__(self):
        from collections import deque
        self._make_queue = deque
        self._minute_log: Dict[str, Any] = {}  # group -> deque[timestamp]
        self._hour_log: Dict[str, Any] = {}    # group -> deque[timestamp]
        self._mutex = threading.Lock()

    def allow(self, group: str) -> bool:
        """Record the event and return True, unless the group's caps are hit."""
        caps = GROUP_RATE_LIMITS.get(group, _DEFAULT_RATE_LIMIT)
        stamp = time.time()
        with self._mutex:
            minute_q = self._minute_log.setdefault(group, self._make_queue())
            hour_q = self._hour_log.setdefault(group, self._make_queue())
            # Evict timestamps that have slid out of each window.
            while minute_q and stamp - minute_q[0] > 60:
                minute_q.popleft()
            while hour_q and stamp - hour_q[0] > 3600:
                hour_q.popleft()
            if len(minute_q) >= caps['max_per_minute'] or len(hour_q) >= caps['max_per_hour']:
                return False
            minute_q.append(stamp)
            hour_q.append(stamp)
            return True

    def get_stats(self) -> Dict[str, Dict[str, int]]:
        """Return the number of events recorded per group in the last minute/hour."""
        stamp = time.time()
        with self._mutex:
            # Copy under the lock so concurrent allow() mutations cannot
            # corrupt the iteration; count outside the lock on the snapshot.
            frozen = {
                g: (list(self._minute_log.get(g, [])), list(self._hour_log.get(g, [])))
                for g in list(self._minute_log.keys())
            }
        return {
            g: {
                'last_minute': sum(1 for t in minute_q if stamp - t <= 60),
                'last_hour': sum(1 for t in hour_q if stamp - t <= 3600),
            }
            for g, (minute_q, hour_q) in frozen.items()
        }
AGGREGATION_RULES = {
    'auth_fail': {'window': 120, 'min_count': 3, 'burst_type': 'burst_auth_fail'},
    'ip_block': {'window': 120, 'min_count': 3, 'burst_type': 'burst_ip_block'},
    'disk_io_error': {'window': 60, 'min_count': 3, 'burst_type': 'burst_disk_io'},
    'split_brain': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'},
    'node_disconnect': {'window': 300, 'min_count': 2, 'burst_type': 'burst_cluster'},
    'service_fail': {'window': 90, 'min_count': 2, 'burst_type': 'burst_service_fail'},
    'service_fail_batch': {'window': 90, 'min_count': 2, 'burst_type': 'burst_service_fail'},
    'system_problem': {'window': 90, 'min_count': 2, 'burst_type': 'burst_system'},
    'oom_kill': {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'},
    'firewall_issue': {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'},
}
# Default catch-all rule for any event type NOT listed above.
# This ensures that even unlisted event types get grouped when they
# burst, avoiding notification floods from any source.
_DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}


class BurstAggregator:
    """Accumulates similar events in a time window, then sends a single summary.

    Examples:
      - "Fail2Ban banned 17 IPs in 2 minutes"
      - "Disk I/O errors: 34 events on /dev/sdb in 60s"

    Memory caps (audit Tier 6 — `BurstAggregator` sin cap de memoria):
      - At most `_MAX_EVENTS_PER_BUCKET` events kept per bucket. Beyond that
        we increment a counter but drop the event payload, so a runaway
        cascade can't OOM the dispatch process.
      - At most `_MAX_BUCKETS` open buckets at any time. Beyond that, the
        oldest bucket is force-evicted.

    Fixes in this revision:
      - `_create_summary` no longer dereferences `self._config`, which this
        class never sets (it raised AttributeError whenever the first event
        in a bucket carried no hostname).
      - `flush_expired` now clears the per-bucket `_dropped` counter (it
        previously leaked one entry per flushed bucket), and the drop count
        is surfaced in the summary details as the ingest-path comment
        always promised.
    """
    _MAX_EVENTS_PER_BUCKET = 500  # Plenty for any reasonable burst window
    _MAX_BUCKETS = 200            # Different (event_type, host) combinations

    def __init__(self):
        self._buckets: Dict[str, List] = {}     # bucket_key -> [events]
        self._deadlines: Dict[str, float] = {}  # bucket_key -> flush_deadline
        self._dropped: Dict[str, int] = {}      # bucket_key -> events dropped past cap
        self._lock = threading.Lock()

    def ingest(self, event: 'NotificationEvent') -> 'Optional[NotificationEvent]':
        """Add event to aggregation. Returns:
        - None if event is being buffered (wait for window)
        - Original event if first in its bucket (sent immediately)

        ALL event types are aggregated: specific rules from AGGREGATION_RULES
        take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
        This prevents notification floods from any source.
        """
        rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
        bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"
        with self._lock:
            if bucket_key not in self._buckets:
                # Cap on number of distinct buckets — drop the oldest if we
                # would exceed it. Without this, a worm-style attacker emitting
                # events with random hostnames could create unbounded buckets.
                if len(self._buckets) >= self._MAX_BUCKETS:
                    oldest_key = min(self._deadlines, key=self._deadlines.get)
                    self._buckets.pop(oldest_key, None)
                    self._deadlines.pop(oldest_key, None)
                    self._dropped.pop(oldest_key, None)
                self._buckets[bucket_key] = []
                self._deadlines[bucket_key] = time.time() + rule['window']
            # Cap events-per-bucket. If we hit the limit, increment a drop
            # counter so the eventual summary can say "+N more dropped".
            if len(self._buckets[bucket_key]) >= self._MAX_EVENTS_PER_BUCKET:
                self._dropped[bucket_key] = self._dropped.get(bucket_key, 0) + 1
                return None
            self._buckets[bucket_key].append(event)
            # First event in bucket: pass through immediately so user gets fast alert
            if len(self._buckets[bucket_key]) == 1:
                return event
            # Subsequent events: buffer (will be flushed as summary)
            return None

    def flush_expired(self) -> 'List[NotificationEvent]':
        """Flush all buckets past their deadline. Returns summary events."""
        now = time.time()
        summaries = []
        with self._lock:
            expired_keys = [k for k, d in self._deadlines.items() if now >= d]
            for key in expired_keys:
                events = self._buckets.pop(key, [])
                del self._deadlines[key]
                # Bug fix: clear the drop counter too — it previously
                # outlived its bucket forever (slow memory leak).
                dropped = self._dropped.pop(key, 0)
                if len(events) < 2:
                    continue  # Single event already sent on ingest, no summary needed
                rule_type = key.split(':')[0]
                rule = AGGREGATION_RULES.get(rule_type, {})
                min_count = rule.get('min_count', 2)
                if len(events) < min_count:
                    continue  # Not enough events for a summary
                summary = self._create_summary(events, rule, dropped=dropped)
                if summary:
                    summaries.append(summary)
        return summaries

    def _create_summary(self, events: 'List[NotificationEvent]',
                        rule: dict, dropped: int = 0) -> 'Optional[NotificationEvent]':
        """Create a single summary event from multiple events.

        Includes individual detail lines so the grouped message is
        self-contained and the user can see exactly what happened.
        `dropped` is the number of events discarded past the bucket cap;
        when non-zero it is appended to the details.
        """
        if not events:
            return None
        first = events[0]
        # Determine highest severity
        sev_order = {'INFO': 0, 'WARNING': 1, 'CRITICAL': 2}
        max_severity = max(events, key=lambda e: sev_order.get(e.severity, 0)).severity
        # Collect unique entity_ids
        entity_ids = list(set(e.entity_id for e in events if e.entity_id))
        entity_list = ', '.join(entity_ids[:10]) if entity_ids else 'multiple sources'
        if len(entity_ids) > 10:
            entity_list += f' (+{len(entity_ids) - 10} more)'
        # Calculate window
        window_secs = events[-1].ts_epoch - events[0].ts_epoch
        if window_secs < 120:
            window_str = f'{int(window_secs)}s'
        else:
            window_str = f'{int(window_secs / 60)}m'
        burst_type = rule.get('burst_type', 'burst_generic')
        # Build detail lines from individual events.
        # For each event we extract the most informative field to show
        # a concise one-line summary (e.g. "- service_fail: pvestatd").
        detail_lines = []
        for ev in events[1:]:  # Skip first (already sent individually)
            line = self._summarize_event(ev)
            if line:
                detail_lines.append(f" - {line}")
        # Cap detail lines to avoid extremely long messages
        details = ''
        if detail_lines:
            if len(detail_lines) > 15:
                shown = detail_lines[:15]
                shown.append(f" ... +{len(detail_lines) - 15} more")
                details = '\n'.join(shown)
            else:
                details = '\n'.join(detail_lines)
        if dropped:
            # Surface events discarded past _MAX_EVENTS_PER_BUCKET.
            extra = f" ... +{dropped} more dropped (bucket cap)"
            details = f"{details}\n{extra}" if details else extra
        data = {
            # Bug fix: was `self._config`, an attribute this class never
            # sets — AttributeError whenever the first event lacked a
            # hostname. getattr keeps any externally-injected config working.
            'hostname': first.data.get('hostname') or _resolve_display_hostname(getattr(self, '_config', None)),
            'count': str(len(events)),
            'window': window_str,
            'entity_list': entity_list,
            'event_type': first.event_type,
            'details': details,
        }
        return NotificationEvent(
            event_type=burst_type,
            severity=max_severity,
            data=data,
            source='aggregator',
            entity=first.entity,
            entity_id='burst',
        )

    @staticmethod
    def _summarize_event(event: 'NotificationEvent') -> str:
        """Extract a concise one-line summary from an event's data."""
        d = event.data
        etype = event.event_type
        # Service failures: show service name
        if etype in ('service_fail', 'service_fail_batch'):
            return d.get('service_name', d.get('display_name', etype))
        # System problems: first 120 chars of reason
        if 'reason' in d:
            return d['reason'].split('\n')[0][:120]
        # Auth / IP: show username or IP
        if 'username' in d:
            return f"{etype}: {d['username']}"
        if 'ip' in d:
            return f"{etype}: {d['ip']}"
        # VM/CT events: show vmid + name
        if 'vmid' in d:
            name = d.get('vmname', '')
            return f"{etype}: {name} ({d['vmid']})" if name else f"{etype}: {d['vmid']}"
        # Fallback: event type + entity_id
        if event.entity_id:
            return f"{etype}: {event.entity_id}"
        return etype
# ─── AI rewrite timeout ────────────────────────────────────────────
# Hard cap on how long the AI rewrite can stall the dispatch thread.
# Ollama on slow CPUs can take 90-120 s per request; leaving the dispatch
# loop blocked that long delays every other event in the queue. We give up
# on the rewrite at AI_REWRITE_TIMEOUT_SECONDS and ship the non-AI version.
# Audit Tier 3.2 #2.
AI_REWRITE_TIMEOUT_SECONDS = 25
# A small dedicated pool so an in-flight AI call never starves the dispatch
# thread itself. One worker is enough — events serialize through the dispatch
# loop already.
_ai_rewrite_pool = concurrent.futures.ThreadPoolExecutor(
max_workers=2, thread_name_prefix='ai-rewrite'
)
def _format_with_ai_bounded(format_fn, *args, timeout=AI_REWRITE_TIMEOUT_SECONDS, **kwargs):
"""Run an AI rewrite with a hard wall-clock timeout.
Returns the rewriter's result on success, or `None` on timeout / exception.
The caller then keeps the original (non-AI) title/body. The future itself
is not cancelled — Python can't cancel an in-flight HTTP call cleanly —
but its result is discarded and the dispatch thread proceeds.
"""
future = _ai_rewrite_pool.submit(format_fn, *args, **kwargs)
try:
return future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
print(f"[NotificationManager] AI rewrite exceeded {timeout}s — falling back to template")
return None
except Exception as e:
print(f"[NotificationManager] AI rewrite raised {type(e).__name__}: {e}")
return None
# ─── Notification Manager ─────────────────────────────────────────
class NotificationManager:
"""Central notification orchestrator.
Manages channels, event watchers, deduplication, and dispatch.
Can run in server mode (background threads) or CLI mode (one-shot).
"""
def __init__(self):
self._channels: Dict[str, Any] = {} # channel_name -> channel_instance
self._event_queue: Queue = Queue()
self._running = False
self._config: Dict[str, str] = {}
self._enabled = False
self._lock = threading.Lock()
# Watchers
self._journal_watcher: Optional[JournalWatcher] = None
self._task_watcher: Optional[TaskWatcher] = None
self._polling_collector: Optional[PollingCollector] = None
self._dispatch_thread: Optional[threading.Thread] = None
# Webhook receiver (no thread, passive)
self._hook_watcher: Optional[ProxmoxHookWatcher] = None
# Cooldown tracking: {fingerprint: last_sent_timestamp}
self._cooldowns: Dict[str, float] = {}
# Storm protection
self._group_limiter = GroupRateLimiter()
self._aggregator = BurstAggregator()
self._aggregation_thread: Optional[threading.Thread] = None
# Stats
self._stats = {
'started_at': None,
'total_sent': 0,
'total_errors': 0,
'last_sent_at': None,
}
# ─── Configuration ──────────────────────────────────────────
    def _load_config(self):
        """Load notification settings from the shared SQLite database.

        Reads every `notification.*` row from `user_settings`, strips the
        prefix, and decrypts sensitive values in memory. Values still stored
        under the legacy `ENC:` XOR scheme are re-encrypted to `ENC2:` within
        the same connection (lazy migration, audit Tier 4 #24). Afterwards,
        per-event toggles are reconciled against template defaults,
        `self._enabled` is refreshed, and channel instances are rebuilt.

        NOTE(review): when the DB file does not exist yet this returns early,
        leaving the previous `_config`/`_enabled`/channel state untouched.
        """
        self._config = {}
        # Pairs of (full_key, plaintext) that were stored under the legacy ENC:
        # XOR scheme. We re-encrypt them in ENC2 and persist atomically below.
        # Audit Tier 4 #24.
        legacy_migrations = []
        try:
            if not DB_PATH.exists():
                return
            conn = sqlite3.connect(str(DB_PATH), timeout=10)
            conn.execute('PRAGMA journal_mode=WAL')
            conn.execute('PRAGMA busy_timeout=5000')
            cursor = conn.cursor()
            cursor.execute(
                'SELECT setting_key, setting_value FROM user_settings WHERE setting_key LIKE ?',
                (f'{SETTINGS_PREFIX}%',)
            )
            for key, value in cursor.fetchall():
                # Strip prefix for internal use
                short_key = key[len(SETTINGS_PREFIX):]
                if short_key in SENSITIVE_KEYS and value:
                    if value.startswith(_LEGACY_ENC_PREFIX):
                        # Decrypt the legacy XOR value, queue a re-encrypt to ENC2
                        plaintext = decrypt_sensitive_value(value)
                        legacy_migrations.append((key, plaintext))
                        value = plaintext
                    elif value.startswith(_NEW_ENC_PREFIX):
                        value = decrypt_sensitive_value(value)
                self._config[short_key] = value
            # Persist any legacy → ENC2 upgrades from this load. Done in the
            # same connection so a partial migration can't leave half ENC, half
            # ENC2 — either all writes commit or we fall through to the next
            # load attempt.
            if legacy_migrations:
                now_iso = datetime.now().isoformat()
                migrated = 0
                for full_key, plaintext in legacy_migrations:
                    try:
                        new_value = encrypt_sensitive_value(plaintext)
                        cursor.execute(
                            'INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at) VALUES (?, ?, ?)',
                            (full_key, new_value, now_iso),
                        )
                        migrated += 1
                    except Exception as e:
                        print(f"[NotificationManager] Failed to migrate {full_key} to ENC2: {e}")
                conn.commit()
                if migrated:
                    print(f"[NotificationManager] Migrated {migrated} legacy ENC values to ENC2")
            conn.close()
        except Exception as e:
            print(f"[NotificationManager] Failed to load config: {e}")
        # Reconcile per-event toggles with current template defaults.
        # If a template's default_enabled was changed (e.g. state_change False),
        # but the DB has a stale 'true' from a previous default, fix it now.
        # Only override if the user hasn't explicitly set it (we track this with
        # a sentinel: if the value came from auto-save of defaults, it may be stale).
        for event_type, tmpl in TEMPLATES.items():
            key = f'event.{event_type}'
            if key in self._config:
                db_val = self._config[key] == 'true'
                tmpl_default = tmpl.get('default_enabled', True)
                # If template says disabled but DB says enabled, AND there's no
                # explicit user marker, enforce the template default.
                if not tmpl_default and db_val:
                    # Check if user explicitly enabled it (look for a marker)
                    marker = f'event_explicit.{event_type}'
                    if marker not in self._config:
                        self._config[key] = 'false'
        self._enabled = self._config.get('enabled', 'false') == 'true'
        self._rebuild_channels()
def _save_setting(self, key: str, value: str):
"""Save a single notification setting to the database."""
full_key = f'{SETTINGS_PREFIX}{key}'
now = datetime.now().isoformat()
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=5000')
cursor = conn.cursor()
cursor.execute('''
INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
VALUES (?, ?, ?)
''', (full_key, value, now))
conn.commit()
conn.close()
self._config[key] = value
except Exception as e:
print(f"[NotificationManager] Failed to save setting {key}: {e}")
def _rebuild_channels(self):
"""Rebuild channel instances from current config.
Builds the new dict in a local variable first, then atomically swaps
`self._channels` under the lock. Without this, a dispatch thread that
iterated `self._channels.items()` mid-rebuild could see a half-empty
dict and silently drop notifications. Audit Tier 6 (Notification
stack #6 — `_config`/`_channels` accedidos sin lock).
"""
new_channels: Dict[str, Any] = {}
for ch_type in CHANNEL_TYPES:
enabled_key = f'{ch_type}.enabled'
if self._config.get(enabled_key) != 'true':
continue
# Gather config keys for this channel
ch_config = {}
for config_key in CHANNEL_TYPES[ch_type]['config_keys']:
full_key = f'{ch_type}.{config_key}'
ch_config[config_key] = self._config.get(full_key, '')
channel = create_channel(ch_type, ch_config)
if channel:
valid, err = channel.validate_config()
if valid:
new_channels[ch_type] = channel
else:
print(f"[NotificationManager] Channel {ch_type} invalid: {err}")
with self._lock:
self._channels = new_channels
def reload_config(self):
"""Reload config from DB without restarting."""
with self._lock:
self._load_config()
return {'success': True, 'channels': list(self._channels.keys())}
# ─── Server Mode (Background) ──────────────────────────────
    def start(self):
        """Start the notification service in server mode.

        Idempotent: returns immediately when already running. Loads config
        and persisted cooldowns first; when the service is disabled in
        settings nothing is launched. Otherwise it best-effort repairs the
        PVE webhook registration, then launches the journal/task/polling
        watchers and the dispatch loop, all as daemon threads. Called by
        flask_server.py on startup.
        """
        if self._running:
            return
        self._load_config()
        self._load_cooldowns_from_db()
        if not self._enabled:
            print("[NotificationManager] Service is disabled. Skipping start.")
            return
        self._running = True
        self._stats['started_at'] = datetime.now().isoformat()
        # Ensure PVE webhook is configured (repairs priv config if missing).
        # Failures here are logged but never block service startup.
        try:
            from flask_notification_routes import setup_pve_webhook_core
            wh_result = setup_pve_webhook_core()
            if wh_result.get('configured'):
                print("[NotificationManager] PVE webhook configured OK.")
            elif wh_result.get('error'):
                print(f"[NotificationManager] PVE webhook warning: {wh_result['error']}")
        except ImportError:
            pass  # flask_notification_routes not loaded yet (early startup)
        except Exception as e:
            print(f"[NotificationManager] PVE webhook setup error: {e}")
        # Start event watchers; all feed the shared event queue consumed by
        # the dispatch loop below.
        self._journal_watcher = JournalWatcher(self._event_queue)
        self._task_watcher = TaskWatcher(self._event_queue)
        self._polling_collector = PollingCollector(self._event_queue)
        self._journal_watcher.start()
        self._task_watcher.start()
        self._polling_collector.start()
        # Start dispatch loop (daemon thread: does not block interpreter exit).
        self._dispatch_thread = threading.Thread(
            target=self._dispatch_loop, daemon=True, name='notification-dispatch'
        )
        self._dispatch_thread.start()
        print(f"[NotificationManager] Started with channels: {list(self._channels.keys())}")
def stop(self):
"""Stop the notification service cleanly."""
self._running = False
if self._journal_watcher:
self._journal_watcher.stop()
if self._task_watcher:
self._task_watcher.stop()
if self._polling_collector:
self._polling_collector.stop()
print("[NotificationManager] Stopped.")
    def _dispatch_loop(self):
        """Main dispatch loop: reads queue -> filters -> formats -> sends -> records.

        Runs on a daemon thread until stop() clears self._running. The 2s
        queue timeout doubles as the tick for periodic maintenance:
        cooldown cleanup (hourly), aggregation-bucket flush (every 5s) and
        digest schedule checks (every minute). time.monotonic() drives the
        timers so wall-clock jumps cannot skew the intervals.
        """
        last_cleanup = time.monotonic()
        last_flush = time.monotonic()
        last_digest_check = time.monotonic()
        cleanup_interval = 3600  # Cleanup cooldowns every hour
        flush_interval = 5  # Flush aggregation buckets every 5s
        digest_check_interval = 60  # Re-evaluate digest schedule every minute
        while self._running:
            try:
                event = self._event_queue.get(timeout=2)
            except Empty:
                # Periodic maintenance during idle
                now_mono = time.monotonic()
                if now_mono - last_cleanup > cleanup_interval:
                    self._cleanup_old_cooldowns()
                    last_cleanup = now_mono
                # Flush expired aggregation buckets
                if now_mono - last_flush > flush_interval:
                    self._flush_aggregation()
                    last_flush = now_mono
                if now_mono - last_digest_check > digest_check_interval:
                    self._maybe_flush_digests()
                    last_digest_check = now_mono
                continue
            try:
                self._process_event(event)
            except Exception as e:
                # One bad event must never kill the dispatch thread.
                print(f"[NotificationManager] Dispatch error: {e}")
            # Also flush aggregation after each event
            if time.monotonic() - last_flush > flush_interval:
                self._flush_aggregation()
                last_flush = time.monotonic()
def _flush_aggregation(self):
"""Flush expired aggregation buckets and dispatch summaries."""
try:
summaries = self._aggregator.flush_expired()
for summary_event in summaries:
# Burst summaries bypass aggregator but still pass cooldown + rate limit
self._process_event_direct(summary_event)
except Exception as e:
print(f"[NotificationManager] Aggregation flush error: {e}")
def _process_event(self, event: NotificationEvent):
"""Process a single event: aggregate -> cooldown -> rate limit -> dispatch.
Per-channel category/event filters are applied in _dispatch_to_channels().
No global category/event filter exists -- each channel decides independently.
"""
if not self._enabled:
return
# Try aggregation (may buffer the event)
result = self._aggregator.ingest(event)
if result is None:
return # Buffered, will be flushed as summary later
event = result # Use original event (first in burst passes through)
# From here, proceed with dispatch (shared with _process_event_direct)
self._dispatch_event(event)
def _process_event_direct(self, event: NotificationEvent):
"""Process a burst summary event. Bypasses aggregator but applies cooldown + rate limit."""
if not self._enabled:
return
self._dispatch_event(event)
def _dispatch_event(self, event: NotificationEvent):
"""Shared dispatch pipeline: cooldown -> rate limit -> render -> send."""
# Suppress VM/CT start/stop during active backups (second layer of defense).
# The primary filter is in TaskWatcher, but timing gaps can let events
# slip through. This catch-all filter checks at dispatch time.
# Exception: CRITICAL and WARNING events should always be notified.
_BACKUP_NOISE_TYPES = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart',
'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'}
if event.event_type in _BACKUP_NOISE_TYPES and event.severity not in ('CRITICAL', 'WARNING'):
if self._is_backup_running():
return
# Check storage exclusions for storage-related events.
# If the storage is excluded from notifications, suppress the event entirely.
_STORAGE_EVENTS = {'storage_unavailable', 'storage_low_space', 'storage_warning', 'storage_error'}
if event.event_type in _STORAGE_EVENTS:
storage_name = event.data.get('storage_name') or event.data.get('name')
if storage_name:
try:
from health_persistence import health_persistence
if health_persistence.is_storage_excluded(storage_name, 'notifications'):
return # Storage is excluded from notifications, skip silently
except Exception:
pass # Continue if check fails
# Cooldown check (does NOT stamp yet — see audit Tier 6, cooldown order).
# If we stamped here, a rate-limit hit or a "no channel enabled for this
# event_type" situation would burn a 24h cooldown on a delivery that
# never reached anyone.
if not self._passes_cooldown(event):
return
# Group rate limit check.
template = TEMPLATES.get(event.event_type, {})
group = template.get('group', 'other')
if not self._group_limiter.allow(group):
return
# Use the properly mapped severity from the event, not from template defaults.
# event.severity was set by _map_severity which normalises to CRITICAL/WARNING/INFO.
severity = event.severity
# Inject the canonical severity into data so templates see it too.
event.data['severity'] = severity
# Render message from template (structured output)
rendered = render_template(event.event_type, event.data)
# Enrich data with structured fields for channels that support them
enriched_data = dict(event.data)
enriched_data['_rendered_fields'] = rendered.get('fields', [])
enriched_data['_body_html'] = rendered.get('body_html', '')
enriched_data['_event_type'] = event.event_type
enriched_data['_group'] = TEMPLATES.get(event.event_type, {}).get('group', 'other')
# Pass journal context if available (for AI enrichment)
if '_journal_context' in event.data:
enriched_data['_journal_context'] = event.data['_journal_context']
# Send through all active channels (AI applied per-channel with detail_level).
# Stamp cooldown only if at least one channel actually delivered — otherwise
# a misconfigured per-channel toggle would silently lock the event under a
# 24h cooldown until someone re-enables it. Audit Tier 6.
delivered = self._dispatch_to_channels(
rendered['title'], rendered['body'], severity,
event.event_type, enriched_data, event.source
)
if delivered:
self._record_cooldown(event.fingerprint)
def _dispatch_to_channels(self, title: str, body: str, severity: str,
event_type: str, data: Dict, source: str) -> bool:
"""Send notification through configured channels, respecting per-channel filters.
Each channel owns its own category/event preferences:
- {channel}.events.{group} = "true"/"false" (category toggle, default "true")
- {channel}.event.{type} = "true"/"false" (per-event toggle, default from template)
No global fallback -- each channel decides independently what it receives.
AI enhancement is applied per-channel with configurable detail level:
- {channel}.ai_detail_level = "brief" | "standard" | "detailed"
Returns True iff at least one channel actually delivered (success). The
caller uses this to decide whether to stamp the cooldown — see audit
Tier 6 (cooldown order interacts with per-channel toggles).
"""
delivered = False
with self._lock:
channels = dict(self._channels)
template = TEMPLATES.get(event_type, {})
event_group = template.get('group', 'other')
default_event_enabled = 'true' if template.get('default_enabled', True) else 'false'
# Build AI config once (shared across channels, detail_level varies)
# Use per-provider API key
ai_provider = self._config.get('ai_provider', 'groq')
ai_api_key = self._config.get(f'ai_api_key_{ai_provider}', '') or self._config.get('ai_api_key', '')
ai_config = {
'ai_enabled': self._config.get('ai_enabled', 'false'),
'ai_provider': ai_provider,
'ai_api_key': ai_api_key,
'ai_model': self._config.get('ai_model', ''),
'ai_language': self._config.get('ai_language', 'en'),
'ai_ollama_url': self._config.get('ai_ollama_url', ''),
# `ai_openai_base_url` was previously dropped from this dict and
# the downstream `notification_templates.AIRewriter` read it from
# the dict — meaning a user who configured LiteLLM / Azure as a
# custom base_url passed the "Test AI" check (which DOES pass it)
# but every real notification silently went to api.openai.com.
# Privacy + UX deception bug. Audit Tier 3.2 #1.
'ai_openai_base_url': self._config.get('ai_openai_base_url', ''),
'ai_prompt_mode': self._config.get('ai_prompt_mode', 'default'),
'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
}
# Get journal context if available (will be enriched per-channel based on detail_level)
raw_journal_context = data.get('_journal_context', '')
for ch_name, channel in channels.items():
# ── Per-channel category check ──
# Default: category enabled (true) unless explicitly disabled.
ch_group_key = f'{ch_name}.events.{event_group}'
if self._config.get(ch_group_key, 'true') == 'false':
continue # Channel has this category disabled
# ── Per-channel event check ──
# Default: from template default_enabled, unless explicitly set.
ch_event_key = f'{ch_name}.event.{event_type}'
if self._config.get(ch_event_key, default_event_enabled) == 'false':
continue # Channel has this specific event disabled
# ── Per-channel quiet hours ──
# The user marks a window (e.g. 22:00 → 06:00) during which only
# CRITICAL events reach this channel. Anything below CRITICAL is
# dropped silently — not buffered, not retried — because the
# whole point is "don't wake me up at 3 AM unless the disk
# exploded". CRITICAL always wins. The window is configured
# per-channel; same channel can have different rules from
# another. See _in_quiet_hours() for boundary semantics.
if event.severity != 'CRITICAL' and self._in_quiet_hours(ch_name):
continue
# ── Per-channel daily digest ──
# If the user has digest mode on for this channel, generic INFO
# events (backup_complete, update_summary, …) are buffered to a
# SQLite table instead of firing immediately. A single grouped
# summary is sent once a day at the configured time. CRITICAL
# and WARNING never wait — they always pass through. Events
# the user explicitly wants to see live (vm_start, etc.) are
# excluded from the digest by `_DIGEST_EXEMPT_EVENTS`.
if self._should_buffer_for_digest(ch_name, event):
self._buffer_digest_event(ch_name, event, title, body, event_group)
continue
try:
ch_title, ch_body = title, body
# ── Per-channel settings ──
# Email defaults to 'detailed' (technical report), others to 'standard'
detail_level_key = f'{ch_name}.ai_detail_level'
default_detail = 'detailed' if ch_name == 'email' else 'standard'
detail_level = self._config.get(detail_level_key, default_detail)
# Rich format (emojis) is a user preference per channel
rich_key = f'{ch_name}.rich_format'
use_rich_format = self._config.get(rich_key, 'false') == 'true'
# ── Per-channel AI enhancement ──
# Apply AI with channel-specific detail level and emoji setting
# If AI is enabled AND rich_format is on, AI will include emojis directly
# Pass channel_type so AI knows whether to append original (email only)
channel_ai_config = {**ai_config, 'channel_type': ch_name}
# Isolate the AI/enrich block in its own try so a failure
# here (raised from enrich_context_for_ai or any other
# helper not protected by _format_with_ai_bounded) does NOT
# abort the channel.send() below — the user still gets the
# raw template-formatted notification. Audit Tier 6 —
# `_dispatch_to_channels`: AI failure dropped the notification.
try:
enriched_context = enrich_context_for_ai(
title=ch_title,
body=ch_body,
event_type=event_type,
data=data,
journal_context=raw_journal_context,
detail_level=detail_level
)
# Wrap the AI rewrite with a hard timeout so a slow Ollama
# call (90-120 s on slow CPUs) doesn't stall the dispatch
# thread and delay every other queued event. On timeout we
# ship the non-AI title/body — the user still gets the
# notification, just without LLM polish. Audit Tier 3.2 #2.
ai_result = _format_with_ai_bounded(
format_with_ai_full,
ch_title, ch_body, severity, channel_ai_config,
detail_level=detail_level,
journal_context=enriched_context,
use_emojis=use_rich_format,
)
if ai_result is not None:
ch_title = ai_result.get('title', ch_title)
ch_body = ai_result.get('body', ch_body)
except Exception as ai_err:
print(f"[NotificationManager] AI enrich/rewrite failed for {ch_name}: "
f"{type(ai_err).__name__}: {ai_err} — sending template")
# Fallback emoji enrichment only if AI is disabled but rich_format is on
# (If AI processed the message with emojis, this is skipped)
ai_enabled_str = ai_config.get('ai_enabled', 'false')
ai_enabled = ai_enabled_str == 'true' if isinstance(ai_enabled_str, str) else bool(ai_enabled_str)
if use_rich_format and not ai_enabled:
ch_title, ch_body = enrich_with_emojis(
event_type, ch_title, ch_body, data
)
result = channel.send(ch_title, ch_body, severity, data)
self._record_history(
event_type, ch_name, ch_title, ch_body, severity,
result.get('success', False),
result.get('error', ''),
source
)
if result.get('success'):
self._stats['total_sent'] += 1
self._stats['last_sent_at'] = datetime.now().isoformat()
delivered = True
else:
self._stats['total_errors'] += 1
print(f"[NotificationManager] Send failed ({ch_name}): {result.get('error')}")
except Exception as e:
self._stats['total_errors'] += 1
self._record_history(
event_type, ch_name, title, body, severity,
False, str(e), source
)
return delivered
# ─── Cooldown / Dedup ───────────────────────────────────────
def _is_backup_running(self) -> bool:
"""Quick check if any vzdump process is currently active.
Reads /var/log/pve/tasks/active and also checks for vzdump processes.
"""
import os
# Method 1: Check active tasks file
try:
with open('/var/log/pve/tasks/active', 'r') as f:
for line in f:
if ':vzdump:' in line:
parts = line.strip().split(':')
if len(parts) >= 3:
try:
# PID in UPID is HEXADECIMAL
pid = int(parts[2], 16)
os.kill(pid, 0)
return True
except (ValueError, ProcessLookupError, PermissionError):
pass
except (OSError, IOError):
pass
# Method 2: Check for running vzdump processes directly
import subprocess
try:
result = subprocess.run(
['pgrep', '-x', 'vzdump'],
capture_output=True, timeout=2
)
if result.returncode == 0:
return True
except Exception:
pass
return False
def _in_quiet_hours(self, ch_name: str) -> bool:
"""True if the channel's quiet-hours window is active right now.
Settings keys (per channel, all strings in user_settings):
<ch>.quiet_enabled — "true" / "false" (default: "false")
<ch>.quiet_start — "HH:MM" (default: "22:00")
<ch>.quiet_end — "HH:MM" (default: "06:00")
The window is half-open [start, end) and may cross midnight
(start > end means "from tonight to tomorrow morning"). If start
equals end the window is treated as disabled — that's the only
way to express "no quiet hours" without a separate boolean. The
wall-clock comparison uses the host's local time (datetime.now())
so a user setting "22:0006:00" gets exactly that in their own
timezone, regardless of the Monitor's container TZ.
"""
if self._config.get(f'{ch_name}.quiet_enabled', 'false') != 'true':
return False
try:
start_s = self._config.get(f'{ch_name}.quiet_start', '22:00')
end_s = self._config.get(f'{ch_name}.quiet_end', '06:00')
sh, sm = (int(x) for x in start_s.split(':', 1))
eh, em = (int(x) for x in end_s.split(':', 1))
except (ValueError, AttributeError):
return False
if (sh, sm) == (eh, em):
return False
now = datetime.now()
cur = now.hour * 60 + now.minute
start = sh * 60 + sm
end = eh * 60 + em
if start < end:
return start <= cur < end
return cur >= start or cur < end
# Events that should NEVER be buffered into a digest — they are
# short-cooldown action / state events the user explicitly opted in
# to see live. Folding them into a daily digest would defeat that
# opt-in. Mirrors the same lists from _passes_cooldown.
_DIGEST_EXEMPT_EVENTS = frozenset({
'backup_complete', 'backup_fail', 'backup_start',
'replication_complete', 'replication_fail',
'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart',
'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart',
'vm_fail', 'ct_fail',
'system_shutdown', 'system_reboot',
})
def _should_buffer_for_digest(self, ch_name: str,
event: NotificationEvent) -> bool:
"""Decide if this event should go to the channel's digest buffer
instead of firing immediately. Only applies to generic INFO
events; CRITICAL and WARNING always pass through.
"""
if event.severity != 'INFO':
return False
if event.event_type in self._DIGEST_EXEMPT_EVENTS:
return False
if self._config.get(f'{ch_name}.digest_enabled', 'false') != 'true':
return False
return True
def _buffer_digest_event(self, ch_name: str, event: NotificationEvent,
title: str, body: str, event_group: str) -> None:
"""Append an event to the channel's digest buffer in SQLite."""
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=5000')
conn.execute(
'INSERT INTO digest_pending '
'(channel, event_type, event_group, severity, ts, title, body) '
'VALUES (?, ?, ?, ?, ?, ?, ?)',
(ch_name, event.event_type, event_group, event.severity,
int(time.time()), title, body),
)
conn.commit()
conn.close()
except Exception as e:
print(f"[NotificationManager] digest buffer write failed: {e}")
def _maybe_flush_digests(self) -> None:
"""Per-channel: if the local clock has reached the configured
digest_time and we haven't flushed in the last 23h, send a
grouped summary of the buffered INFO events and clear the rows.
Idempotent — safe to call every minute from the dispatch loop.
"""
now = datetime.now()
cur_minute = now.hour * 60 + now.minute
for ch_name, channel in list(self._channels.items()):
if self._config.get(f'{ch_name}.digest_enabled', 'false') != 'true':
continue
time_str = self._config.get(f'{ch_name}.digest_time', '09:00')
try:
dh, dm = (int(x) for x in time_str.split(':', 1))
except (ValueError, AttributeError):
continue
target = dh * 60 + dm
# Fire when we're at the target minute *or* up to 5 min after,
# so a brief skip in the loop (busy dispatch) doesn't lose the
# day's digest. The last_at guard makes this idempotent.
if not (target <= cur_minute < target + 5):
continue
last_iso = self._config.get(f'{ch_name}.digest_last_at', '')
if last_iso:
try:
last_dt = datetime.fromisoformat(last_iso)
if (now - last_dt).total_seconds() < 23 * 3600:
continue # Already sent today
except ValueError:
pass
try:
self._flush_digest_for_channel(ch_name, channel, now)
except Exception as e:
print(f"[NotificationManager] digest flush failed for "
f"{ch_name}: {e}")
def _flush_digest_for_channel(self, ch_name: str, channel: Any,
now: datetime) -> None:
"""Read pending rows for the channel, render a grouped summary,
send it, and delete the buffer entries on success."""
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
cursor = conn.cursor()
cursor.execute(
'SELECT id, event_type, event_group, ts, title, body '
'FROM digest_pending WHERE channel = ? ORDER BY ts ASC',
(ch_name,),
)
rows = cursor.fetchall()
conn.close()
except Exception as e:
print(f"[NotificationManager] digest read failed for {ch_name}: {e}")
return
# Mark `last_at` even if there's nothing to send — otherwise an
# empty buffer keeps re-triggering the 5-minute window for hours.
self._save_setting(f'{ch_name}.digest_last_at', now.isoformat())
if not rows:
return
host = _hostname(self._config)
summary_title = (
f"{host}: 24h summary ({now.strftime('%Y-%m-%d %H:%M')})"
)
summary_body = self._compose_digest_body(rows)
try:
channel.send(summary_title, summary_body, severity='INFO',
data={'_digest': True, '_count': len(rows)})
except Exception as e:
print(f"[NotificationManager] digest send failed for "
f"{ch_name}: {e}")
return
# Delete only after a successful send so a transient failure
# doesn't lose the day's data.
try:
ids = [r[0] for r in rows]
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
placeholders = ','.join('?' * len(ids))
conn.execute(
f'DELETE FROM digest_pending WHERE id IN ({placeholders})',
ids,
)
conn.commit()
conn.close()
except Exception as e:
print(f"[NotificationManager] digest cleanup failed for "
f"{ch_name}: {e}")
def _compose_digest_body(self, rows: list) -> str:
"""Render a grouped summary body. rows is a list of
(id, event_type, event_group, ts, title, body) tuples ordered
by timestamp ASC.
"""
from collections import OrderedDict
groups: OrderedDict[str, list] = OrderedDict()
for _id, ev_type, group, ts, title, body in rows:
label = group or 'other'
groups.setdefault(label, []).append((ts, ev_type, title))
lines = [f"{len(rows)} INFO events grouped by category:\n"]
for group, items in groups.items():
lines.append(f"{group.title()}: {len(items)}")
for ts, ev_type, title in items[:8]:
hhmm = datetime.fromtimestamp(ts).strftime('%H:%M')
short_title = title.split(': ', 1)[-1] if ': ' in title else title
lines.append(f"{hhmm} {short_title}")
if len(items) > 8:
lines.append(f" • … and {len(items) - 8} more")
lines.append('')
lines.append(
'(Critical/Warning events arrived at the time they happened, '
'not in this digest.)'
)
return '\n'.join(lines).rstrip() + '\n'
def _passes_cooldown(self, event: NotificationEvent) -> bool:
"""Check if the event passes cooldown rules WITHOUT stamping.
Splits the historical `_check_cooldown` into a pure predicate plus
`_record_cooldown` (separate stamp). Lets the caller check rate-limit
and per-channel filters first — if any of those drop the event, we
avoid burning a 24h cooldown on a delivery that never happened.
Audit Tier 6 (Notification stack #4 + cooldown/per-channel interaction).
"""
now = time.time()
# Determine cooldown period
template = TEMPLATES.get(event.event_type, {})
group = template.get('group', 'system')
# Priority: per-type config > per-severity > default
cooldown_key = f'cooldown.{event.event_type}'
cooldown_str = self._config.get(cooldown_key)
if cooldown_str is None:
cooldown_key_group = f'cooldown.{group}'
cooldown_str = self._config.get(cooldown_key_group)
if cooldown_str is not None:
cooldown = int(cooldown_str)
else:
cooldown = DEFAULT_COOLDOWNS.get(event.severity, 86400)
# Health state-change events (CPU at 95%, memory full, temperature
# critical, disk near capacity, ...) describe a SUSTAINED state, not
# a one-shot bug. Sitting at 24h means the user gets one ping when
# the threshold trips and then silence even if the condition lasts
# for days; the 30min/1h cadence here keeps them reminded while the
# condition persists, but coarse enough to not flood the channel.
# The 24h default takes over for unique bug/error events
# (system_problem, auth_fail, ...) that get nothing from being
# repeated.
_HEALTH_CATEGORY_COOLDOWNS = {
'disks': 86400, 'smart': 86400, 'zfs': 86400, 'updates': 86400,
'storage': 3600, 'temperature': 3600, 'logs': 3600,
'security': 3600, 'disk': 3600,
'network': 1800, 'pve_services': 1800,
'vms': 1800, 'cpu': 1800, 'memory': 1800,
}
if event.event_type == 'state_change' and event.source == 'health':
cat = (event.data or {}).get('category', '')
cat_cd = _HEALTH_CATEGORY_COOLDOWNS.get(cat)
if cat_cd is not None and cooldown_str is None:
cooldown = cat_cd
# Backup/replication events: each execution is unique and should
# always be delivered. A 10s cooldown prevents exact duplicates
# (webhook + tasks) but allows repeated backup jobs to report.
_ALWAYS_DELIVER = {'backup_complete', 'backup_fail', 'backup_start',
'replication_complete', 'replication_fail'}
if event.event_type in _ALWAYS_DELIVER and cooldown_str is None:
cooldown = 10
# VM/CT state changes are real user actions that should always be
# delivered. Each start/stop/shutdown is a distinct event. A 5s
# cooldown prevents exact duplicates from concurrent watchers.
_STATE_EVENTS = {
'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart',
'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart',
'vm_fail', 'ct_fail',
}
if event.event_type in _STATE_EVENTS and cooldown_str is None:
cooldown = 5
# System shutdown/reboot must be delivered immediately -- the node
# is going down and there may be only seconds to send the message.
_URGENT_EVENTS = {'system_shutdown', 'system_reboot'}
if event.event_type in _URGENT_EVENTS and cooldown_str is None:
cooldown = 5
# Check against last sent time using stable fingerprint. Stamp is
# deferred to `_record_cooldown()` — only invoked once the event has
# passed rate-limit AND at least one channel actually delivered it.
last_sent = self._cooldowns.get(event.fingerprint, 0)
if now - last_sent < cooldown:
return False
return True
def _record_cooldown(self, fingerprint: str):
"""Stamp the cooldown for a fingerprint that was actually delivered."""
now = time.time()
self._cooldowns[fingerprint] = now
self._persist_cooldown(fingerprint, now)
def _load_cooldowns_from_db(self):
"""Load persistent cooldown state from SQLite (up to 48h).
The `notification_last_sent` table is shared with PollingCollector
(`health_*` prefix) and JournalWatcher (`diskio_*`, `fs_*`,
`fs_serial_*`, plain device names). The manager's own fingerprints
always look like `<host>:<entity>:<...>:<event_type>` (contain `:`).
Loading the OTHER modules' fingerprints into `self._cooldowns` made
the manager's own dispatch see ghost cooldowns from rows it never
wrote — which then failed `_passes_cooldown` and dropped events
that should have gone through. Audit Tier 6 (Notification stack —
`_load_cooldowns_from_db` carga TODOS los fingerprints).
"""
try:
if not DB_PATH.exists():
return
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
cursor = conn.cursor()
cursor.execute('SELECT fingerprint, last_sent_ts FROM notification_last_sent')
now = time.time()
_OTHER_PREFIXES = ('health_', 'diskio_', 'fs_', 'fs_serial_')
loaded = 0
for fp, ts in cursor.fetchall():
if not fp:
continue
# Skip rows owned by other producers; they manage their own state.
if fp.startswith(_OTHER_PREFIXES):
continue
# The manager's fingerprint format always contains `:`.
if ':' not in fp:
continue
if now - ts < 172800: # 48h window
self._cooldowns[fp] = ts
loaded += 1
conn.close()
if loaded:
print(f"[NotificationManager] Loaded {loaded} cooldowns from DB")
except Exception as e:
print(f"[NotificationManager] Failed to load cooldowns: {e}")
def _persist_cooldown(self, fingerprint: str, ts: float):
"""Save cooldown timestamp to SQLite for restart persistence."""
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=5000')
conn.execute('''
INSERT OR REPLACE INTO notification_last_sent (fingerprint, last_sent_ts, count)
VALUES (?, ?, COALESCE(
(SELECT count + 1 FROM notification_last_sent WHERE fingerprint = ?), 1
))
''', (fingerprint, int(ts), fingerprint))
conn.commit()
conn.close()
except Exception:
pass # Non-critical, in-memory cooldown still works
def _cleanup_old_cooldowns(self):
"""Remove cooldown entries older than 48h from both memory and DB."""
cutoff = time.time() - 172800 # 48h
self._cooldowns = {k: v for k, v in self._cooldowns.items() if v > cutoff}
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('DELETE FROM notification_last_sent WHERE last_sent_ts < ?', (int(cutoff),))
conn.commit()
conn.close()
except Exception:
pass
# ─── History Recording ──────────────────────────────────────
def _record_history(self, event_type: str, channel: str, title: str,
message: str, severity: str, success: bool,
error_message: str, source: str):
"""Record a notification attempt in the history table."""
try:
conn = sqlite3.connect(str(DB_PATH), timeout=10)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=5000')
cursor = conn.cursor()
cursor.execute('''
INSERT INTO notification_history
(event_type, channel, title, message, severity, sent_at, success, error_message, source)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
event_type, channel, title, message[:500], severity,
datetime.now().isoformat(), 1 if success else 0,
error_message[:500] if error_message else None, source
))
conn.commit()
conn.close()
except Exception as e:
print(f"[NotificationManager] History record error: {e}")
# ─── Public API (used by Flask routes and CLI) ──────────────
def emit_event(self, event_type: str, severity: str, data: Dict,
source: str = 'api', entity: str = 'node', entity_id: str = '') -> Dict[str, Any]:
"""Emit an event through the notification system.
This creates a NotificationEvent and processes it through the normal pipeline,
including toggle checks, template rendering, and cooldown.
Used by internal endpoints like the shutdown notification hook.
Args:
event_type: Type of event (must match TEMPLATES key)
severity: INFO, WARNING, CRITICAL
data: Event data for template rendering
source: Origin of event
entity: Entity type (node, vm, ct, storage, etc.)
entity_id: Entity identifier
"""
from notification_events import NotificationEvent
event = NotificationEvent(
event_type=event_type,
severity=severity,
data=data,
source=source,
entity=entity,
entity_id=entity_id,
)
# For urgent events (shutdown/reboot), dispatch directly to ensure
# immediate delivery before the system goes down.
# For other events, use the normal pipeline with aggregation.
_URGENT_EVENTS = {'system_shutdown', 'system_reboot'}
if event_type in _URGENT_EVENTS:
self._dispatch_event(event)
return {'success': True, 'event_type': event_type, 'dispatched': 'direct'}
else:
self._process_event(event)
return {'success': True, 'event_type': event_type, 'dispatched': 'queued'}
    def send_notification(self, event_type: str, severity: str,
                          title: str, message: str,
                          data: Optional[Dict] = None,
                          source: str = 'api',
                          skip_toggle_check: bool = False) -> Dict[str, Any]:
        """Send a notification directly (bypasses queue and cooldown).
        Used by CLI and API for explicit sends.
        Args:
            event_type: Type of event (must match TEMPLATES key)
            severity: INFO, WARNING, CRITICAL
            title: Notification title
            message: Notification body
            data: Extra data for template rendering
            source: Origin of notification (api, cli, health_monitor, etc.)
            skip_toggle_check: If True, send even if event toggle is disabled.
                               Use for 'custom' or 'other' events that should always send.
        Returns:
            Dict with 'success' (True iff at least one channel delivered),
            'channels_sent', 'errors' and 'total_channels'.
        """
        # Lazy config load: CLI invocations construct the manager without
        # ever calling start(), so channels may not be built yet.
        if not self._channels:
            self._load_config()
        if not self._channels:
            return {
                'success': False,
                'error': 'No channels configured or enabled',
                'channels_sent': [],
            }
        # Check if this event type is enabled (unless explicitly skipped)
        # 'custom' and 'other' events always send (used for manual/script notifications)
        if not skip_toggle_check and event_type not in ('custom', 'other'):
            if not self.is_event_enabled(event_type):
                return {
                    'success': False,
                    'error': f'Event type "{event_type}" is disabled in notification settings',
                    'channels_sent': [],
                    'skipped': True,
                }
        # Render template only when the caller supplied no message body.
        if event_type in TEMPLATES and not message:
            rendered = render_template(event_type, data or {})
            title = title or rendered['title']
            message = rendered['body']
            # NOTE(review): assumes render_template() always includes a
            # 'severity' key in its result — confirm in notification_templates.
            severity = severity or rendered['severity']
        # AI config for enhancement - use per-provider API key
        ai_provider = self._config.get('ai_provider', 'groq')
        ai_api_key = self._config.get(f'ai_api_key_{ai_provider}', '') or self._config.get('ai_api_key', '')
        ai_config = {
            'ai_enabled': self._config.get('ai_enabled', 'false'),
            'ai_provider': ai_provider,
            'ai_api_key': ai_api_key,
            'ai_model': self._config.get('ai_model', ''),
            'ai_language': self._config.get('ai_language', 'en'),
            'ai_ollama_url': self._config.get('ai_ollama_url', ''),
            # Forward the custom OpenAI-compatible base URL (LiteLLM /
            # Azure) so real sends honour it — not only the "Test AI"
            # check. Audit Tier 3.2 #1.
            'ai_openai_base_url': self._config.get('ai_openai_base_url', ''),
            'ai_prompt_mode': self._config.get('ai_prompt_mode', 'default'),
            'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
        }
        results = {}
        channels_sent = []
        errors = []
        # Snapshot under the lock so a concurrent reload_config() cannot
        # mutate the dict while we iterate.
        with self._lock:
            channels = dict(self._channels)
        for ch_name, channel in channels.items():
            try:
                # Apply AI enhancement per channel with its detail level and emoji setting
                detail_level_key = f'{ch_name}.ai_detail_level'
                detail_level = self._config.get(detail_level_key, 'standard')
                rich_key = f'{ch_name}.rich_format'
                use_rich_format = self._config.get(rich_key, 'false') == 'true'
                # Pass channel_type so AI knows whether to append original (email only)
                channel_ai_config = {**ai_config, 'channel_type': ch_name}
                ai_result = format_with_ai_full(
                    title, message, severity, channel_ai_config,
                    detail_level=detail_level,
                    use_emojis=use_rich_format
                )
                ch_title = ai_result.get('title', title)
                ch_message = ai_result.get('body', message)
                result = channel.send(ch_title, ch_message, severity, data)
                results[ch_name] = result
                self._record_history(
                    event_type, ch_name, ch_title, ch_message, severity,
                    result.get('success', False),
                    result.get('error', ''),
                    source
                )
                if result.get('success'):
                    channels_sent.append(ch_name)
                else:
                    errors.append(f"{ch_name}: {result.get('error')}")
            except Exception as e:
                # Per-channel failures are collected, never raised: one bad
                # channel must not stop delivery to the others.
                errors.append(f"{ch_name}: {str(e)}")
        return {
            'success': len(channels_sent) > 0,
            'channels_sent': channels_sent,
            'errors': errors,
            'total_channels': len(channels),
        }
def send_raw(self, title: str, message: str,
             severity: str = 'INFO',
             source: str = 'api') -> Dict[str, Any]:
    """Dispatch an untemplated message (used by custom scripts).

    Thin convenience wrapper: routes through send_notification() with
    the generic 'custom' event type so channel fan-out, AI formatting
    and history recording all still apply.
    """
    event_type = 'custom'  # generic template key for free-form messages
    return self.send_notification(event_type, severity, title, message, source=source)
def test_channel(self, channel_name: str = 'all') -> Dict[str, Any]:
    """Test one or all configured channels with AI enhancement.

    Sends a welcome/test message through each target channel using the
    same per-channel formatting pipeline as real notifications (AI
    rewrite, detail level, rich/emoji mode) so the user sees a faithful
    preview. Every attempt is recorded in notification_history with
    event_type 'test'. For Telegram, additionally sends the ProxMenux
    logo as a photo the user can reuse as the bot avatar.

    Args:
        channel_name: a channel key (e.g. 'telegram'), or 'all' to test
            every currently active channel. A channel that is configured
            but not enabled is instantiated ad hoc from stored config so
            it can still be tested.

    Returns:
        {'success': <True if any channel succeeded>,
         'results': {channel: {'success': bool, 'error': str}}}
        or {'success': False, 'error': ...} when no target resolves.
    """
    # Lazy-load config/channels on first use (CLI mode constructs the
    # manager without starting the service).
    if not self._channels:
        self._load_config()
    if not self._channels:
        return {'success': False, 'error': 'No channels configured'}
    results = {}
    if channel_name == 'all':
        targets = dict(self._channels)
    elif channel_name in self._channels:
        targets = {channel_name: self._channels[channel_name]}
    else:
        # Try to create channel from config even if not enabled
        ch_config = {}
        for config_key in CHANNEL_TYPES.get(channel_name, {}).get('config_keys', []):
            ch_config[config_key] = self._config.get(f'{channel_name}.{config_key}', '')
        channel = create_channel(channel_name, ch_config)
        if channel:
            targets = {channel_name: channel}
        else:
            return {'success': False, 'error': f'Channel {channel_name} not configured'}
    # AI config for enhancement - use per-provider API key, with a
    # fallback to the legacy single-key setting for older installs.
    ai_provider = self._config.get('ai_provider', 'groq')
    ai_api_key = self._config.get(f'ai_api_key_{ai_provider}', '') or self._config.get('ai_api_key', '')
    ai_config = {
        'ai_enabled': self._config.get('ai_enabled', 'false'),
        'ai_provider': ai_provider,
        'ai_api_key': ai_api_key,
        'ai_model': self._config.get('ai_model', ''),
        'ai_language': self._config.get('ai_language', 'en'),
        'ai_ollama_url': self._config.get('ai_ollama_url', ''),
        # `ai_openai_base_url` was previously dropped from this dict and
        # the downstream `notification_templates.AIRewriter` read it from
        # the dict — meaning a user who configured LiteLLM / Azure as a
        # custom base_url passed the "Test AI" check (which DOES pass it)
        # but every real notification silently went to api.openai.com.
        # Privacy + UX deception bug. Audit Tier 3.2 #1.
        'ai_openai_base_url': self._config.get('ai_openai_base_url', ''),
        'ai_prompt_mode': self._config.get('ai_prompt_mode', 'default'),
        'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
    }
    # Normalize the string-stored flag to a bool for local decisions.
    ai_enabled = self._config.get('ai_enabled', 'false')
    if isinstance(ai_enabled, str):
        ai_enabled = ai_enabled.lower() == 'true'
    ai_language = self._config.get('ai_language', 'en')
    ai_prompt_mode = self._config.get('ai_prompt_mode', 'default')
    # Determine AI info string based on prompt mode
    if ai_prompt_mode == 'custom':
        ai_info = f'{ai_provider} / custom prompt'
    else:
        ai_info = f'{ai_provider} / {ai_language}'
    # ProxMenux logo for welcome message
    logo_url = 'https://proxmenux.com/telegram.png'
    logo_caption = 'You can use this image as the profile photo for your notification bot.'
    for ch_name, channel in targets.items():
        try:
            # Get per-channel settings
            detail_level_key = f'{ch_name}.ai_detail_level'
            detail_level = self._config.get(detail_level_key, 'standard')
            rich_key = f'{ch_name}.rich_format'
            use_rich_format = self._config.get(rich_key, 'false') == 'true'
            # Build status indicators for icons and AI, adapted to channel format
            if use_rich_format:
                icon_status = '✅ Icons: enabled'
                ai_status = f'✅ AI: enabled ({ai_info})' if ai_enabled else '❌ AI: disabled'
            else:
                icon_status = 'Icons: disabled'
                ai_status = f'AI: enabled ({ai_info})' if ai_enabled else 'AI: disabled'
            # Base test message — shows current channel config
            # NOTE: narrative lines are intentionally unlabeled so the AI
            # does not prepend "Message:" or other spurious field labels.
            base_title = 'ProxMenux Test'
            base_message = (
                'Welcome to ProxMenux Monitor!\n\n'
                'This is a test message to verify your notification channel is working correctly.\n\n'
                'Channel configuration:\n'
                f'{icon_status}\n'
                f'{ai_status}\n\n'
                'You will receive alerts about system events, health status changes, and security incidents.'
            )
            # Apply AI enhancement (translates to configured language)
            # Pass channel_type so AI knows whether to append original (email only)
            channel_ai_config = {**ai_config, 'channel_type': ch_name}
            ai_result = format_with_ai_full(
                base_title, base_message, 'INFO', channel_ai_config,
                detail_level=detail_level,
                use_emojis=use_rich_format
            )
            # Presumably format_with_ai_full returns the originals when AI
            # is disabled or errors — the .get() defaults guard that;
            # confirm in notification_templates.
            enhanced_title = ai_result.get('title', base_title)
            enhanced_message = ai_result.get('body', base_message)
            # Send message
            send_result = channel.send(enhanced_title, enhanced_message, 'INFO')
            success = send_result.get('success', False)
            error = send_result.get('error', '')
            # For Telegram: send logo with caption suggesting it as bot profile photo
            if success and ch_name == 'telegram' and hasattr(channel, 'send_photo'):
                # Translate caption if AI is active
                if ai_enabled:
                    caption_result = format_with_ai_full(
                        '', logo_caption, 'INFO', channel_ai_config,
                        detail_level='brief', use_emojis=use_rich_format
                    )
                    caption = caption_result.get('body', logo_caption)
                else:
                    caption = logo_caption
                try:
                    channel.send_photo(logo_url, caption=caption)
                except TypeError:
                    # Fallback if send_photo doesn't support caption parameter
                    channel.send_photo(logo_url)
            results[ch_name] = {'success': success, 'error': error}
            # History stores the body truncated to 500 chars.
            self._record_history(
                'test', ch_name, enhanced_title,
                enhanced_message[:500], 'INFO',
                success, error, 'api'
            )
        except Exception as e:
            # A failure in one channel must not abort the others.
            results[ch_name] = {'success': False, 'error': str(e)}
    overall_success = any(r['success'] for r in results.values())
    return {
        'success': overall_success,
        'results': results,
    }
# ─── Proxmox Webhook ──────────────────────────────────────────
def process_webhook(self, payload: dict) -> dict:
    """Handle an incoming Proxmox webhook payload.

    Lazily instantiates the ProxmoxHookWatcher (bound to the shared
    event queue) on first use, then forwards the payload to it.
    """
    watcher = self._hook_watcher
    if not watcher:
        watcher = ProxmoxHookWatcher(self._event_queue)
        self._hook_watcher = watcher
    return watcher.process_webhook(payload)
def get_webhook_secret(self) -> str:
    """Return the shared secret used to authenticate Proxmox webhooks.

    Lazily loads configuration on first access; an unset secret is
    reported as '' (meaning: no secret check is enforced).
    """
    cfg = self._config
    if not cfg:
        self._load_config()
        cfg = self._config
    return cfg.get('webhook_secret', '')
def get_webhook_allowed_ips(self) -> list:
    """Return the webhook source-IP allow-list as a list of strings.

    The setting is stored as a comma-separated string; blank entries
    and surrounding whitespace are discarded. An empty list means
    "allow all sources".
    """
    if not self._config:
        self._load_config()
    configured = self._config.get('webhook_allowed_ips', '')
    if not configured:
        return []
    allowed = []
    for entry in str(configured).split(','):
        entry = entry.strip()
        if entry:
            allowed.append(entry)
    return allowed
# ─── Status & Settings ──────────────────────────────────────
def get_status(self) -> Dict[str, Any]:
    """Snapshot of the service: enabled/running flags, active channels,
    cumulative send stats, and which watcher threads are live.

    A watcher counts as live only when its object exists AND the
    service loop is running.
    """
    if not self._config:
        self._load_config()
    running = self._running
    channel_map = {
        name: {'type': name, 'connected': True}
        for name in self._channels
    }
    watcher_state = {
        'journal': running and self._journal_watcher is not None,
        'task': running and self._task_watcher is not None,
        'polling': running and self._polling_collector is not None,
    }
    return {
        'enabled': self._enabled,
        'running': running,
        'channels': channel_map,
        'stats': self._stats,
        'watchers': watcher_state,
    }
def set_enabled(self, enabled: bool) -> Dict[str, Any]:
    """Persist the service 'enabled' flag and reconcile runtime state.

    Starts the background service when turning on (if not already
    running) and stops it when turning off (if currently running).
    """
    flag_value = 'true' if enabled else 'false'
    self._save_setting('enabled', flag_value)
    self._enabled = enabled
    if enabled:
        if not self._running:
            self.start()
    else:
        if self._running:
            self.stop()
    return {'success': True, 'enabled': enabled}
def is_event_enabled(self, event_type: str) -> bool:
    """Return True when at least one active channel will emit this event.

    The UI stores toggles per channel: '{channel}.events.{group}' for
    the whole category and '{channel}.event.{event_type}' for the
    individual event (defaulting to the template's default_enabled).
    With no active channels the event is treated as disabled, since a
    notification would have no destination. Callers such as the health
    polling thread use this to skip work for muted events.
    """
    if not self._config:
        self._load_config()
    template = TEMPLATES.get(event_type, {})
    default_state = 'true' if template.get('default_enabled', True) else 'false'
    group = template.get('group', 'other')
    # Candidate channels come from CHANNEL_TYPES — the canonical registry
    # of channel implementations — filtered to those the user enabled.
    active = [
        name for name in CHANNEL_TYPES
        if self._config.get(f'{name}.enabled', 'false') == 'true'
    ]
    if not active:
        return False
    for name in active:
        # The category toggle gates every event in the group for this channel.
        if self._config.get(f'{name}.events.{group}', 'true') == 'false':
            continue
        if self._config.get(f'{name}.event.{event_type}', default_state) == 'true':
            return True
    # Every active channel has this event switched off.
    return False
def list_channels(self) -> Dict[str, Any]:
    """Describe every known channel type for the UI/CLI.

    For each registered channel type reports: its display name, whether
    the user enabled it, whether ALL of its config keys are populated,
    and whether a live channel object is currently instantiated.
    """
    if not self._config:
        self._load_config()
    summary = {}
    for ch_type, info in CHANNEL_TYPES.items():
        is_enabled = self._config.get(f'{ch_type}.enabled', 'false') == 'true'
        has_all_keys = True
        for key in info['config_keys']:
            if not self._config.get(f'{ch_type}.{key}', ''):
                has_all_keys = False
                break
        summary[ch_type] = {
            'name': info['name'],
            'enabled': is_enabled,
            'configured': has_all_keys,
            'active': ch_type in self._channels,
        }
    return {'channels': summary}
def get_history(self, limit: int = 50, offset: int = 0,
                severity: str = '', channel: str = '') -> Dict[str, Any]:
    """Return notification history rows plus a total count.

    Args:
        limit/offset: pagination window, applied after filtering.
        severity/channel: optional exact-match filters; empty string
            means "no filter" for that column.

    Returns:
        {'history': [row dicts], 'total': int, 'limit': int, 'offset': int}
        on success; on failure 'history' is empty, 'total' is 0 and
        'error' carries the message.
    """
    conn = None
    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=5000')
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        # Filters are appended as parameterized clauses — never
        # string-interpolated — keeping the query injection-safe.
        query = 'SELECT * FROM notification_history WHERE 1=1'
        params: list = []
        if severity:
            query += ' AND severity = ?'
            params.append(severity)
        if channel:
            query += ' AND channel = ?'
            params.append(channel)
        query += ' ORDER BY sent_at DESC LIMIT ? OFFSET ?'
        params.extend([limit, offset])
        cursor.execute(query, params)
        rows = [dict(row) for row in cursor.fetchall()]
        # Total count uses the same filters but no pagination, so the UI
        # can render page controls.
        count_query = 'SELECT COUNT(*) FROM notification_history WHERE 1=1'
        count_params: list = []
        if severity:
            count_query += ' AND severity = ?'
            count_params.append(severity)
        if channel:
            count_query += ' AND channel = ?'
            count_params.append(channel)
        cursor.execute(count_query, count_params)
        total = cursor.fetchone()[0]
        return {
            'history': rows,
            'total': total,
            'limit': limit,
            'offset': offset,
        }
    except Exception as e:
        return {'history': [], 'total': 0, 'error': str(e)}
    finally:
        # Fix: close on ALL paths — the previous version only closed on
        # success, leaking the connection whenever a query failed.
        if conn is not None:
            conn.close()
def clear_history(self) -> Dict[str, Any]:
    """Delete every row from notification_history.

    Returns:
        {'success': True} on success, otherwise
        {'success': False, 'error': <message>}.
    """
    conn = None
    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=5000')
        conn.execute('DELETE FROM notification_history')
        conn.commit()
        return {'success': True}
    except Exception as e:
        return {'success': False, 'error': str(e)}
    finally:
        # Fix: release the connection even when DELETE/commit fails —
        # the previous version only closed it on the success path.
        if conn is not None:
            conn.close()
def get_settings(self) -> Dict[str, Any]:
    """Get all notification settings for the UI.

    Returns a structure matching the frontend's NotificationConfig shape
    so the round-trip (GET -> edit -> POST) is seamless.

    Secrets (channel tokens, API keys, webhook secret) are replaced with
    SENSITIVE_PLACEHOLDER when populated, so cleartext never leaves the
    server just because the settings page loaded.

    Side effect: if a legacy single `ai_api_key` is stored and the current
    provider has no per-provider key yet, the legacy key is migrated into
    `ai_api_key_{provider}` in the DB (encrypted) and then removed.
    """
    if not self._config:
        self._load_config()
    # Build nested channels object matching frontend ChannelConfig.
    # Any per-channel key that is also in SENSITIVE_KEYS (e.g. telegram.token,
    # email.password, discord.webhook_url) is masked with SENSITIVE_PLACEHOLDER
    # when populated — so the secret never leaves the server in cleartext just
    # because the UI loaded the settings page. The frontend treats the
    # placeholder as "unchanged" on save.
    channels = {}
    for ch_type, info in CHANNEL_TYPES.items():
        ch_cfg: Dict[str, Any] = {
            'enabled': self._config.get(f'{ch_type}.enabled', 'false') == 'true',
            'rich_format': self._config.get(f'{ch_type}.rich_format', 'false') == 'true',
        }
        for config_key in info['config_keys']:
            full_key = f'{ch_type}.{config_key}'
            raw = self._config.get(full_key, '')
            if full_key in SENSITIVE_KEYS:
                ch_cfg[config_key] = _mask_if_set(raw)
            else:
                ch_cfg[config_key] = raw
        channels[ch_type] = ch_cfg
    # Build event_types_by_group for UI rendering
    event_types_by_group = get_event_types_by_group()
    # Build per-channel overrides
    # Each channel independently owns its category and event toggles.
    # Keys: {channel}.events.{group} and {channel}.event.{event_type}
    # Defaults: categories default to true, events default to template default_enabled.
    channel_overrides = {}
    for ch_type in CHANNEL_TYPES:
        ch_overrides = {'categories': {}, 'events': {}}
        for group_key in EVENT_GROUPS:
            saved = self._config.get(f'{ch_type}.events.{group_key}')
            ch_overrides['categories'][group_key] = (saved or 'true') == 'true'
        for event_type_key, tmpl in TEMPLATES.items():
            default = 'true' if tmpl.get('default_enabled', True) else 'false'
            saved = self._config.get(f'{ch_type}.event.{event_type_key}')
            ch_overrides['events'][event_type_key] = (saved or default) == 'true'
        channel_overrides[ch_type] = ch_overrides
    # Build AI detail levels per channel
    # NOTE(review): this reads 'ai_detail_level_{channel}' while the send
    # path and save-time validation use '{channel}.ai_detail_level' —
    # looks like a key-name mismatch; confirm which spelling is persisted.
    ai_detail_levels = {}
    for ch_type in CHANNEL_TYPES:
        ai_detail_levels[ch_type] = self._config.get(f'ai_detail_level_{ch_type}', 'standard')
    # Note: Model migration for deprecated models is handled by the periodic
    # verify_and_update_ai_model() check. Users now load models dynamically
    # from providers using the "Load" button in the UI.
    # Capture raw API key state before masking — needed for the legacy
    # migration check below, otherwise the mask placeholder would make
    # `not ai_api_keys[current_provider]` look populated and skip migration.
    current_provider = self._config.get('ai_provider', 'groq')
    _raw_keys = {
        'groq': self._config.get('ai_api_key_groq', ''),
        'gemini': self._config.get('ai_api_key_gemini', ''),
        'anthropic': self._config.get('ai_api_key_anthropic', ''),
        'openai': self._config.get('ai_api_key_openai', ''),
        'openrouter': self._config.get('ai_api_key_openrouter', ''),
    }
    ai_api_keys = {
        'groq': _mask_if_set(_raw_keys['groq']),
        'ollama': '',  # Ollama doesn't need API key
        'gemini': _mask_if_set(_raw_keys['gemini']),
        'anthropic': _mask_if_set(_raw_keys['anthropic']),
        'openai': _mask_if_set(_raw_keys['openai']),
        'openrouter': _mask_if_set(_raw_keys['openrouter']),
    }
    # Get per-provider selected models
    ai_models = {
        'groq': self._config.get('ai_model_groq', ''),
        'ollama': self._config.get('ai_model_ollama', ''),
        'gemini': self._config.get('ai_model_gemini', ''),
        'anthropic': self._config.get('ai_model_anthropic', ''),
        'openai': self._config.get('ai_model_openai', ''),
        'openrouter': self._config.get('ai_model_openrouter', ''),
    }
    # Migrate legacy ai_api_key to per-provider key if exists. Use the raw
    # (unmasked) key state so the check is accurate.
    legacy_api_key = self._config.get('ai_api_key', '')
    if legacy_api_key and not _raw_keys.get(current_provider, ''):
        # Migrate legacy key to current provider — but report it masked.
        ai_api_keys[current_provider] = SENSITIVE_PLACEHOLDER
        try:
            conn = sqlite3.connect(str(DB_PATH), timeout=10)
            cursor = conn.cursor()
            # Save migrated key (encrypted at rest, like save_settings does)
            migrated_key = encrypt_sensitive_value(legacy_api_key) if legacy_api_key else ''
            cursor.execute('''
                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                VALUES (?, ?, ?)
            ''', (f'{SETTINGS_PREFIX}ai_api_key_{current_provider}', migrated_key, datetime.now().isoformat()))
            # Clear legacy key
            cursor.execute('''
                DELETE FROM user_settings WHERE setting_key = ?
            ''', (f'{SETTINGS_PREFIX}ai_api_key',))
            conn.commit()
            conn.close()
            # Keep the decrypted value in memory for runtime use.
            self._config[f'ai_api_key_{current_provider}'] = legacy_api_key
            del self._config['ai_api_key']
            print(f"[NotificationManager] Migrated legacy API key to {current_provider}")
        except Exception as e:
            # Best-effort migration: a failure here leaves the legacy key
            # in place and is retried on the next settings load.
            # NOTE(review): the connection is not closed on this error
            # path — same leak pattern as elsewhere in this module.
            print(f"[NotificationManager] Failed to migrate legacy API key: {e}")
    config = {
        'enabled': self._enabled,
        'channels': channels,
        # NOTE(review): legacy global toggle fields kept empty — the real
        # toggles live in channel_overrides; confirm the frontend still
        # expects these keys to exist.
        'event_categories': {},
        'event_toggles': {},
        'event_types_by_group': event_types_by_group,
        'channel_overrides': channel_overrides,
        'ai_enabled': self._config.get('ai_enabled', 'false') == 'true',
        'ai_provider': current_provider,
        'ai_api_keys': ai_api_keys,
        'ai_models': ai_models,
        'ai_model': self._config.get('ai_model', ''),
        'ai_language': self._config.get('ai_language', 'en'),
        'ai_ollama_url': self._config.get('ai_ollama_url', 'http://localhost:11434'),
        'ai_openai_base_url': self._config.get('ai_openai_base_url', ''),
        'ai_prompt_mode': self._config.get('ai_prompt_mode', 'default'),
        'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
        'ai_allow_suggestions': self._config.get('ai_allow_suggestions', 'false') == 'true',
        'ai_detail_levels': ai_detail_levels,
        'hostname': self._config.get('hostname', ''),
        'webhook_secret': _mask_if_set(self._config.get('webhook_secret', '')),
        'webhook_allowed_ips': self._config.get('webhook_allowed_ips', ''),
        'pbs_host': self._config.get('pbs_host', ''),
        'pve_host': self._config.get('pve_host', ''),
        'pbs_trusted_sources': self._config.get('pbs_trusted_sources', ''),
    }
    return {
        'success': True,
        'config': config,
    }
def save_settings(self, settings: Dict[str, str]) -> Dict[str, Any]:
    """Save multiple notification settings at once.

    Accepts a flat {key: value} dict; keys may arrive with or without
    SETTINGS_PREFIX. Each value is validated (SSRF-guarded URLs, enum
    whitelists, prompt size cap), secrets are encrypted at rest, then the
    in-memory config and channel objects are rebuilt. Toggling the global
    'enabled' flag also starts/stops the service and sets up / cleans up
    the PVE webhook (server mode only).

    Returns:
        {'success': True, 'channels_active': [...]} — plus 'pve_webhook'
        when webhook auto-configuration ran — or
        {'success': False, 'error': ...}. Any validation failure aborts
        the whole batch before commit.
    """
    # Whitelist enums that flow into AI prompt template variables
    # (detail_level controls instruction phrasing, ai_language picks
    # the response language). Without validation a soft prompt
    # injection lands in the system prompt verbatim. Audit Tier 3.2 #4.
    # (Hoisted out of the loop — they are constants.)
    _ALLOWED_DETAIL_LEVELS = ('brief', 'standard', 'detailed')
    _ALLOWED_AI_LANGUAGES = (
        'en', 'es', 'fr', 'de', 'it', 'pt', 'ru',
        'ja', 'zh', 'ko', 'pl', 'nl', 'tr', 'ar',
    )
    conn = None
    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=10)
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA busy_timeout=5000')
        cursor = conn.cursor()
        now = datetime.now().isoformat()
        for key, value in settings.items():
            # Accept both prefixed and unprefixed keys
            full_key = key if key.startswith(SETTINGS_PREFIX) else f'{SETTINGS_PREFIX}{key}'
            short_key = full_key[len(SETTINGS_PREFIX):]
            # Skip placeholder writes for sensitive fields. The UI receives a
            # masked placeholder for any populated secret on GET; if it sends
            # the placeholder back unchanged we must not overwrite the real
            # stored value. See audit Tier 2 #17c.
            if short_key in SENSITIVE_KEYS and str(value) == SENSITIVE_PLACEHOLDER:
                continue
            # SSRF guard at the storage boundary. Without this an
            # attacker who reaches save_settings could persist a malicious
            # `ai_ollama_url` (e.g. file:///etc/shadow or http://127.0.0.1:8006)
            # and have it consumed silently in the dispatch path on every
            # subsequent notification — even without ever calling test-ai.
            # Audit Tier 3 #19.
            if short_key == 'ai_ollama_url' and str(value):
                ok, err = validate_external_url(str(value), allow_loopback=True)
                if not ok:
                    raise ValueError(f"Invalid ai_ollama_url: {err}")
            if short_key == 'ai_openai_base_url' and str(value):
                ok, err = validate_external_url(str(value), allow_loopback=False)
                if not ok:
                    raise ValueError(f"Invalid ai_openai_base_url: {err}")
            # Cap ai_custom_prompt to a sane size. Without this, a caller
            # who reaches save_settings can store a 10 MB prompt that gets
            # injected into every AI call: cost amplification, latency
            # blow-up, and easier prompt-injection by sheer volume. 8 KB
            # is plenty for any reasonable system prompt. Audit Tier 3.2 #3.
            if short_key == 'ai_custom_prompt' and isinstance(value, str) and len(value) > 8192:
                raise ValueError("ai_custom_prompt exceeds 8 KB cap")
            if short_key.endswith('.ai_detail_level') or short_key == 'ai_detail_level':
                if str(value) not in _ALLOWED_DETAIL_LEVELS:
                    raise ValueError(f"Invalid ai_detail_level: must be one of {_ALLOWED_DETAIL_LEVELS}")
            if short_key == 'ai_language':
                if str(value) not in _ALLOWED_AI_LANGUAGES:
                    raise ValueError(f"Invalid ai_language: must be one of {_ALLOWED_AI_LANGUAGES}")
            # Encrypt sensitive values before storing. Skip if the value is
            # already in either encrypted form — `encrypt_sensitive_value`
            # is idempotent on its own input but the explicit check avoids
            # a wasted PBKDF2 derivation when round-tripping unchanged values.
            store_value = str(value)
            if (short_key in SENSITIVE_KEYS and store_value
                    and not store_value.startswith(_NEW_ENC_PREFIX)
                    and not store_value.startswith(_LEGACY_ENC_PREFIX)):
                store_value = encrypt_sensitive_value(store_value)
            cursor.execute('''
                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                VALUES (?, ?, ?)
            ''', (full_key, store_value, now))
            # Keep decrypted value in memory for runtime use
            self._config[short_key] = str(value)
            # If user is explicitly enabling an event that defaults to disabled,
            # mark it so _load_config reconciliation won't override it later.
            if short_key.startswith('event.') and str(value) == 'true':
                event_type = short_key[6:]  # strip 'event.'
                tmpl = TEMPLATES.get(event_type, {})
                if not tmpl.get('default_enabled', True):
                    marker_key = f'{SETTINGS_PREFIX}event_explicit.{event_type}'
                    cursor.execute('''
                        INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                        VALUES (?, ?, ?)
                    ''', (marker_key, 'true', now))
                    self._config[f'event_explicit.{event_type}'] = 'true'
        conn.commit()
        # Rebuild channels with new config
        was_enabled = self._enabled
        self._enabled = self._config.get('enabled', 'false') == 'true'
        self._rebuild_channels()
        # Start/stop service and auto-configure PVE webhook
        pve_webhook_result = None
        if self._enabled and not was_enabled:
            # Notifications just got ENABLED -> start service + setup PVE webhook
            if not self._running:
                self.start()
            try:
                from flask_notification_routes import setup_pve_webhook_core
                pve_webhook_result = setup_pve_webhook_core()
            except ImportError:
                pass  # flask_notification_routes not available (CLI mode)
            except Exception as e:
                pve_webhook_result = {'configured': False, 'error': str(e)}
        elif not self._enabled and was_enabled:
            # Notifications just got DISABLED -> stop service + cleanup PVE webhook
            if self._running:
                self.stop()
            try:
                from flask_notification_routes import cleanup_pve_webhook_core
                cleanup_pve_webhook_core()
            except ImportError:
                pass
            except Exception:
                pass
        result = {'success': True, 'channels_active': list(self._channels.keys())}
        if pve_webhook_result:
            result['pve_webhook'] = pve_webhook_result
        return result
    except Exception as e:
        return {'success': False, 'error': str(e)}
    finally:
        # Fix: close the connection on ALL paths — previously a validation
        # ValueError raised mid-loop leaked the handle (close only happened
        # on the success path). Double-close after the success path is a
        # safe no-op for sqlite3.
        if conn is not None:
            conn.close()
def verify_and_update_ai_model(self) -> Dict[str, Any]:
    """Verify current AI model is available, update if deprecated.

    This method checks if the configured AI model is still available
    from the provider. If not, it automatically migrates to the best
    available fallback model and notifies the administrator.

    Should be called periodically (e.g., every 24 hours) to catch
    model deprecations before they cause notification failures.

    Returns:
        Dict with:
            - checked: bool - whether check was performed
            - migrated: bool - whether model was changed
            - old_model: str - previous model (if migrated)
            - new_model: str - current/new model
            - message: str - status message
    """
    if self._config.get('ai_enabled', 'false') != 'true':
        return {'checked': False, 'migrated': False, 'message': 'AI not enabled'}
    provider_name = self._config.get('ai_provider', 'groq')
    current_model = self._config.get('ai_model', '')
    # Skip Ollama - user manages their own models
    if provider_name == 'ollama':
        return {'checked': False, 'migrated': False, 'message': 'Ollama models managed locally'}
    # Get the API key for this provider (per-provider key, falling back
    # to the legacy single-key setting).
    api_key = self._config.get(f'ai_api_key_{provider_name}', '') or self._config.get('ai_api_key', '')
    if not api_key:
        return {'checked': False, 'migrated': False, 'message': 'No API key configured'}
    try:
        # Load the curated list of verified models shipped with ProxMenux.
        verified_models = []
        recommended_model = ''
        try:
            # Try AppImage path first (scripts and config both in /usr/bin/)
            script_dir = Path(__file__).parent
            config_path = script_dir / 'config' / 'verified_ai_models.json'
            if not config_path.exists():
                # Try development path (AppImage/scripts/ -> AppImage/config/)
                config_path = script_dir.parent / 'config' / 'verified_ai_models.json'
            if config_path.exists():
                with open(config_path, 'r') as f:
                    verified_config = json.load(f)
                provider_config = verified_config.get(provider_name, {})
                verified_models = provider_config.get('models', [])
                recommended_model = provider_config.get('recommended', '')
        except Exception as e:
            # Best-effort: a missing/broken curated list falls back to
            # the provider's live API listing below.
            print(f"[NotificationManager] Failed to load verified models: {e}")
        from ai_providers import get_provider
        provider = get_provider(provider_name, api_key=api_key, model=current_model)
        if not provider:
            return {'checked': False, 'migrated': False, 'message': f'Unknown provider: {provider_name}'}
        # Get available models from API
        api_models = provider.list_models()
        # Combine: use verified models that are also in API (or all verified if API fails)
        if api_models and verified_models:
            available_models = [m for m in verified_models if m in api_models]
        elif verified_models:
            available_models = verified_models
        elif api_models:
            available_models = api_models
        else:
            return {'checked': True, 'migrated': False, 'message': 'Could not retrieve model list'}
        # Check if current model is available
        if current_model in available_models:
            return {
                'checked': True,
                'migrated': False,
                'new_model': current_model,
                'message': f'Model {current_model} is available'
            }
        # Model not available - use recommended or first available
        recommended = recommended_model if recommended_model in available_models else (available_models[0] if available_models else '')
        if not recommended or recommended == current_model:
            return {
                'checked': True,
                'migrated': False,
                'new_model': current_model,
                'message': f'Model {current_model} not in list but no alternative found'
            }
        # Migrate to new model: persist to DB first, then mirror in memory.
        old_model = current_model
        try:
            # NOTE(review): connection is not closed if execute/commit
            # raises — same leak pattern as elsewhere; consider a finally.
            conn = sqlite3.connect(str(DB_PATH), timeout=10)
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO user_settings (setting_key, setting_value, updated_at)
                VALUES (?, ?, ?)
            ''', (f'{SETTINGS_PREFIX}ai_model', recommended, datetime.now().isoformat()))
            conn.commit()
            conn.close()
            self._config['ai_model'] = recommended
            print(f"[NotificationManager] AI model migrated: {old_model} -> {recommended}")
            return {
                'checked': True,
                'migrated': True,
                'old_model': old_model,
                'new_model': recommended,
                'message': f'Model migrated from {old_model} to {recommended}'
            }
        except Exception as e:
            return {
                'checked': True,
                'migrated': False,
                'message': f'Failed to save new model: {e}'
            }
    except Exception as e:
        print(f"[NotificationManager] Model verification failed: {e}")
        return {'checked': False, 'migrated': False, 'message': str(e)}
# ─── Singleton (for server mode) ─────────────────────────────────
# Shared instance imported by the Flask server; the CLI entry point below
# constructs its own NotificationManager instead of reusing this one.
notification_manager = NotificationManager()
# ─── CLI Interface ────────────────────────────────────────────────
def _print_result(result: Dict, as_json: bool):
    """Print a CLI action result to stdout.

    With as_json=True the raw dict is dumped as indented JSON. Otherwise
    a short human-readable summary is printed, its shape chosen from the
    keys present in *result*: send results ('channels_sent'), per-channel
    test results ('results'), channel listing ('channels'), service
    status ('enabled' + 'running'), enable/disable acknowledgement
    ('enabled'), or a JSON fallback for anything else.
    """
    if as_json:
        print(json.dumps(result, indent=2, default=str))
        return
    # Status prefix, printed on the same line as the summary that follows.
    # Fix: these were f-strings with no placeholders (ruff F541).
    if result.get('success'):
        print("OK: ", end='')
    elif 'success' in result and not result['success']:
        print("ERROR: ", end='')
    # Format based on content
    if 'channels_sent' in result:
        sent = result.get('channels_sent', [])
        print(f"Sent via: {', '.join(sent) if sent else 'none'}")
        if result.get('errors'):
            for err in result['errors']:
                print(f"  Error: {err}")
    elif 'results' in result:
        for ch, r in result['results'].items():
            status = 'OK' if r['success'] else f"FAILED: {r['error']}"
            print(f"  {ch}: {status}")
    elif 'channels' in result:
        for ch, info in result['channels'].items():
            status = 'active' if info.get('active') else ('configured' if info.get('configured') else 'not configured')
            enabled = 'enabled' if info.get('enabled') else 'disabled'
            print(f"  {info['name']}: {enabled}, {status}")
    elif 'enabled' in result and 'running' in result:
        print(f"Enabled: {result['enabled']}, Running: {result['running']}")
        if result.get('stats'):
            stats = result['stats']
            print(f"  Total sent: {stats.get('total_sent', 0)}")
            print(f"  Total errors: {stats.get('total_errors', 0)}")
            if stats.get('last_sent_at'):
                print(f"  Last sent: {stats['last_sent_at']}")
    elif 'enabled' in result:
        print(f"Service {'enabled' if result['enabled'] else 'disabled'}")
    else:
        # Unrecognized shape: fall back to JSON so nothing is hidden.
        print(json.dumps(result, indent=2, default=str))
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='ProxMenux Notification Manager CLI',
        epilog='Example: python3 notification_manager.py --action send --type vm_fail --severity CRITICAL --title "VM 100 failed" --message "QEMU process crashed"'
    )
    parser.add_argument('--action', required=True,
                        choices=['send', 'send-raw', 'test', 'status',
                                 'enable', 'disable', 'list-channels'],
                        help='Action to perform')
    parser.add_argument('--type', help='Event type for send action (e.g. vm_fail, backup_complete)')
    parser.add_argument('--severity', default='INFO',
                        choices=['INFO', 'WARNING', 'CRITICAL'],
                        help='Notification severity (default: INFO)')
    parser.add_argument('--title', help='Notification title')
    parser.add_argument('--message', help='Notification message body')
    parser.add_argument('--channel', default='all',
                        help='Specific channel for test (default: all)')
    parser.add_argument('--json', action='store_true',
                        help='Output result as JSON')
    cli = parser.parse_args()

    # CLI mode builds its own manager rather than using the module
    # singleton, then loads stored config before dispatching.
    manager = NotificationManager()
    manager._load_config()

    # Zero-argument actions dispatch through a small table; 'send' and
    # 'send-raw' need extra argument validation so they keep explicit
    # branches below.
    simple_actions = {
        'test': lambda: manager.test_channel(cli.channel),
        'status': manager.get_status,
        'enable': lambda: manager.set_enabled(True),
        'disable': lambda: manager.set_enabled(False),
        'list-channels': manager.list_channels,
    }

    if cli.action == 'send':
        if not cli.type:
            parser.error('--type is required for send action')
        result = manager.send_notification(
            cli.type, cli.severity,
            cli.title or '', cli.message or '',
            data={
                'hostname': _resolve_display_hostname(manager._config),
                'reason': cli.message or '',
            },
            source='cli'
        )
    elif cli.action == 'send-raw':
        if not cli.title or not cli.message:
            parser.error('--title and --message are required for send-raw')
        result = manager.send_raw(cli.title, cli.message, cli.severity, source='cli')
    elif cli.action in simple_actions:
        result = simple_actions[cli.action]()
    else:
        # Unreachable in practice (argparse choices), kept for safety.
        result = {'error': f'Unknown action: {cli.action}'}

    _print_result(result, cli.json)
    # Exit non-zero when the action reported failure; results without a
    # 'success' key (status/list output) count as success.
    sys.exit(0 if result.get('success', True) else 1)