Update notification service

This commit is contained in:
MacRimi
2026-03-25 19:21:37 +01:00
parent 2241b125d6
commit 6c2b03ae76
5 changed files with 144 additions and 10 deletions

View File

@@ -99,6 +99,8 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found"
cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found"
cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found"
chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true
# Copy AI providers module for notification enhancement
echo "📋 Copying AI providers module..."

View File

@@ -951,3 +951,46 @@ def proxmox_webhook():
except Exception as e:
# Still return 200 to avoid PVE flagging the webhook as broken
return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200
# ─── Internal Shutdown Event Endpoint ─────────────────────────────
def _is_loopback_addr(addr):
    """Return True when *addr* denotes a loopback peer.

    Accepts any address in 127.0.0.0/8, ``::1``, and IPv4-mapped IPv6
    loopback forms such as ``::ffff:127.0.0.1`` (the form a dual-stack
    listening socket reports for an IPv4 client). The literal string
    ``'localhost'`` is still accepted for backward compatibility.
    """
    import ipaddress

    if not addr:
        return False
    if addr == 'localhost':
        return True
    try:
        ip = ipaddress.ip_address(addr)
    except ValueError:
        return False
    # Unwrap ::ffff:a.b.c.d explicitly; is_loopback on the mapped IPv6
    # form is not reliable across Python versions.
    mapped = getattr(ip, 'ipv4_mapped', None)
    if mapped is not None:
        ip = mapped
    return ip.is_loopback


@notification_bp.route('/api/internal/shutdown-event', methods=['POST'])
def internal_shutdown_event():
    """
    Internal endpoint called by the systemd ExecStop script to emit a
    shutdown/reboot notification before the service terminates.

    Request body (JSON object, all keys optional):
        event_type: 'system_shutdown' (default) or 'system_reboot'
        hostname:   reported host name (default 'unknown')
        reason:     human-readable reason text

    Responses:
        200 {'success': True, 'event_type': ...} - event handed to notification_manager
        400 {'error': 'invalid_payload'}         - JSON body is not an object
        400 {'error': 'invalid_event_type'}      - unsupported event_type
        403 {'error': 'forbidden', ...}          - caller is not a loopback address
        500 {'error': 'internal_error', ...}     - unexpected failure while emitting
    """
    # Security: only loopback callers may trigger this endpoint.
    if not _is_loopback_addr(request.remote_addr):
        return jsonify({'error': 'forbidden', 'detail': 'localhost only'}), 403

    try:
        data = request.get_json(silent=True) or {}
        # A JSON array/string/number has no .get(); reject it as a client
        # error instead of letting it surface as a 500.
        if not isinstance(data, dict):
            return jsonify({'error': 'invalid_payload'}), 400

        event_type = data.get('event_type', 'system_shutdown')
        hostname = data.get('hostname', 'unknown')
        reason = data.get('reason', 'System is shutting down.')

        # Validate event type
        if event_type not in ('system_shutdown', 'system_reboot'):
            return jsonify({'error': 'invalid_event_type'}), 400

        # Emit the notification directly through notification_manager
        notification_manager.emit_event(
            event_type=event_type,
            severity='INFO',
            data={
                'hostname': hostname,
                'reason': reason,
            },
            source='systemd',
            entity='node',
            entity_id='',
        )
        return jsonify({'success': True, 'event_type': event_type}), 200
    except Exception as e:
        return jsonify({'error': 'internal_error', 'detail': str(e)}), 500

View File

@@ -34,13 +34,18 @@ class _SharedState:
Used to coordinate behavior when host-level events affect VM/CT events:
- Suppress vm_stop/ct_stop during host shutdown (they're expected)
- Aggregate vm_start/ct_start during startup into single message
Two separate grace periods:
- startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
- startup_health_grace: Time to suppress transient health errors (longer, 3 min)
"""
def __init__(self):
self._lock = threading.Lock()
self._shutdown_time: float = 0 # timestamp when shutdown was detected
self._shutdown_grace = 120 # suppress VM/CT stops for 2 minutes after shutdown detected
self._startup_time: float = time.time() # when module was loaded (service start)
self._startup_grace = 300 # aggregate VM/CT starts for 5 minutes after startup
self._startup_vm_grace = 120 # aggregate VM/CT starts for 2 minutes after startup
self._startup_health_grace = 180 # suppress health warnings for 3 minutes after startup
self._startup_vms: list = [] # [(vmid, vmname, 'vm'|'ct'), ...]
self._startup_aggregated = False # have we already sent the aggregated message?
@@ -57,9 +62,18 @@ class _SharedState:
return (time.time() - self._shutdown_time) < self._shutdown_grace
def is_startup_period(self) -> bool:
"""Check if we're within the startup aggregation period."""
"""Check if we're within the startup VM aggregation period (2 min)."""
with self._lock:
return (time.time() - self._startup_time) < self._startup_grace
return (time.time() - self._startup_time) < self._startup_vm_grace
def is_startup_health_grace(self) -> bool:
    """Report whether the startup health grace window is still open.

    Returns True while fewer than ``self._startup_health_grace`` seconds
    have elapsed since ``self._startup_time``. PollingCollector consults
    this to hold back transient health warnings (QMP timeout, storage
    not ready, etc.) while the system is still booting.
    """
    with self._lock:
        elapsed = time.time() - self._startup_time
        return elapsed < self._startup_health_grace
def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
"""Record a VM/CT start during startup period for later aggregation."""
@@ -1769,7 +1783,6 @@ class PollingCollector:
# Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
self._known_errors: Dict[str, dict] = {}
self._first_poll_done = False
self._startup_time = time.time() # Track when service started
def start(self):
if self._running:
@@ -1792,10 +1805,8 @@ class PollingCollector:
# ── Main loop ──────────────────────────────────────────────
# Startup grace period: ignore transient errors from certain categories
# during the first N seconds after service start. Remote services like
# PBS storage, VMs with qemu-guest-agent, etc. may take time to connect.
STARTUP_GRACE_PERIOD = 180 # 3 minutes
# Categories where transient errors are suppressed during startup grace period.
# PBS storage, NFS mounts, VMs with qemu-guest-agent need time after boot.
STARTUP_GRACE_CATEGORIES = {'storage', 'vms', 'network', 'pve_services'}
def _poll_loop(self):
@@ -1907,8 +1918,8 @@ class PollingCollector:
# Startup grace period: ignore transient errors from categories that
# typically need time to stabilize after boot (storage, VMs, network).
# PBS storage, NFS mounts, VMs with qemu-guest-agent need time to connect.
time_since_startup = now - self._startup_time
if time_since_startup < self.STARTUP_GRACE_PERIOD:
# Uses the shared state so grace period is consistent across all watchers.
if _shared_state.is_startup_health_grace():
if category in self.STARTUP_GRACE_CATEGORIES:
# Still within grace period for this category - skip notification
continue

View File

@@ -1015,6 +1015,37 @@ class NotificationManager:
# ─── Public API (used by Flask routes and CLI) ──────────────
def emit_event(self, event_type: str, severity: str, data: Dict,
               source: str = 'api', entity: str = 'node', entity_id: str = '') -> Dict[str, Any]:
    """Wrap the arguments in a NotificationEvent and run it through the pipeline.

    The event is handed to ``self._process_single_event``, i.e. the same
    path regular events take (described in the original notes as covering
    toggle checks, template rendering, and cooldown). Internal endpoints
    such as the shutdown notification hook use this entry point.

    Args:
        event_type: Type of event (must match TEMPLATES key)
        severity: INFO, WARNING, CRITICAL
        data: Event data for template rendering
        source: Origin of event
        entity: Entity type (node, vm, ct, storage, etc.)
        entity_id: Entity identifier

    Returns:
        Whatever ``_process_single_event`` returns for the event.
    """
    # Imported at call time, as in the original implementation.
    from notification_events import NotificationEvent

    return self._process_single_event(
        NotificationEvent(
            event_type=event_type,
            severity=severity,
            data=data,
            source=source,
            entity=entity,
            entity_id=entity_id,
        )
    )
def send_notification(self, event_type: str, severity: str,
title: str, message: str,
data: Optional[Dict] = None,

View File

@@ -0,0 +1,47 @@
#!/bin/bash
# ProxMenux Monitor - Shutdown Notification Script
#
# Called by systemd ExecStop before the service terminates. It posts a
# shutdown/reboot event to the still-running Flask server on localhost,
# waits briefly, then asks the monitor process to terminate.
#
# Environment:
#   PORT     TCP port of the local Flask server (default: 5000).
#   MAINPID  Set by systemd for control processes such as ExecStop; when
#            present it is used to signal the service's main process.
#
# Every step is best-effort: the script always exits 0 so it cannot fail
# the stop job.

PORT="${PORT:-5000}"

# Escape backslashes and double quotes so a value can be embedded inside
# a JSON string literal.
json_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    printf '%s' "$s"
}

# Determine if this is a reboot or shutdown.
# While ExecStop runs, reboot.target is normally still a *queued job* and
# not yet active, so check the job queue first; the is-active test and the
# scheduled-shutdown file are kept as fallbacks.
is_reboot=false
if systemctl list-jobs 2>/dev/null | grep -Eq 'reboot\.target.*start'; then
    is_reboot=true
elif systemctl is-active --quiet reboot.target 2>/dev/null; then
    is_reboot=true
elif grep -q "reboot" /run/systemd/shutdown/scheduled 2>/dev/null; then
    is_reboot=true
fi

# Build the event type and message
if [ "$is_reboot" = true ]; then
    event_type="system_reboot"
    reason="The system is rebooting."
else
    event_type="system_shutdown"
    reason="The system is shutting down."
fi

hostname=$(hostname)

# Assemble the JSON body with escaped values so an unusual hostname cannot
# produce an invalid payload.
payload=$(printf '{"event_type": "%s", "hostname": "%s", "reason": "%s"}' \
    "$event_type" "$(json_escape "$hostname")" "$(json_escape "$reason")")

# Try to send notification via internal API endpoint.
# The Flask server may still be running at this point.
curl -s -X POST "http://127.0.0.1:$PORT/api/internal/shutdown-event" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    --max-time 5 2>/dev/null || true

# Give the notification a moment to be sent
sleep 2

# Now terminate the Flask process.
# Prefer the main PID handed over by systemd: a command-line pattern match
# can also hit unrelated processes that merely mention the name. Fall back
# to the pattern match when the script is run outside systemd.
if [ -n "${MAINPID:-}" ]; then
    kill -TERM "$MAINPID" 2>/dev/null || true
else
    pkill -TERM -f "proxmenux-monitor" 2>/dev/null || true
fi

exit 0