diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh
index 0f540547..585a472c 100644
--- a/AppImage/scripts/build_appimage.sh
+++ b/AppImage/scripts/build_appimage.sh
@@ -99,6 +99,8 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
 cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
 cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found"
 cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found"
+cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found"
+chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true
 
 # Copy AI providers module for notification enhancement
 echo "📋 Copying AI providers module..."
diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py
index 8261d97d..7c3294b7 100644
--- a/AppImage/scripts/flask_notification_routes.py
+++ b/AppImage/scripts/flask_notification_routes.py
@@ -951,3 +951,48 @@ def proxmox_webhook():
     except Exception as e:
         # Still return 200 to avoid PVE flagging the webhook as broken
         return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200
+
+
+# ─── Internal Shutdown Event Endpoint ─────────────────────────────
+
+@notification_bp.route('/api/internal/shutdown-event', methods=['POST'])
+def internal_shutdown_event():
+    """
+    Internal endpoint called by systemd ExecStop script to emit shutdown/reboot notification.
+    This allows the service to send a notification BEFORE it terminates.
+
+    Only accepts requests from loopback addresses for security.
+    """
+    # Security: only allow loopback clients. request.remote_addr is always an
+    # IP string (never 'localhost'); on a dual-stack listener an IPv4 loopback
+    # client is reported in IPv4-mapped form.
+    remote_addr = request.remote_addr
+    if remote_addr not in ('127.0.0.1', '::1', '::ffff:127.0.0.1'):
+        return jsonify({'error': 'forbidden', 'detail': 'localhost only'}), 403
+
+    try:
+        data = request.get_json(silent=True) or {}
+        event_type = data.get('event_type', 'system_shutdown')
+        hostname = data.get('hostname', 'unknown')
+        reason = data.get('reason', 'System is shutting down.')
+
+        # Validate event type
+        if event_type not in ('system_shutdown', 'system_reboot'):
+            return jsonify({'error': 'invalid_event_type'}), 400
+
+        # Emit the notification directly through notification_manager
+        notification_manager.emit_event(
+            event_type=event_type,
+            severity='INFO',
+            data={
+                'hostname': hostname,
+                'reason': reason,
+            },
+            source='systemd',
+            entity='node',
+            entity_id='',
+        )
+
+        return jsonify({'success': True, 'event_type': event_type}), 200
+    except Exception as e:
+        return jsonify({'error': 'internal_error', 'detail': str(e)}), 500
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py
index a68a1c5a..5bcd2aa6 100644
--- a/AppImage/scripts/notification_events.py
+++ b/AppImage/scripts/notification_events.py
@@ -34,13 +34,18 @@ class _SharedState:
     Used to coordinate behavior when host-level events affect VM/CT events:
     - Suppress vm_stop/ct_stop during host shutdown (they're expected)
     - Aggregate vm_start/ct_start during startup into single message
+
+    Two separate grace periods:
+    - startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
+    - startup_health_grace: Time to suppress transient health errors (longer, 3 min)
     """
     def __init__(self):
         self._lock = threading.Lock()
         self._shutdown_time: float = 0  # timestamp when shutdown was detected
         self._shutdown_grace = 120  # suppress VM/CT stops for 2 minutes after shutdown detected
         self._startup_time: float = time.time()  # when module was loaded (service start)
-        self._startup_grace = 300  # aggregate VM/CT starts for 5 minutes after startup
+        self._startup_vm_grace = 120  # aggregate VM/CT starts for 2 minutes after startup
+        self._startup_health_grace = 180  # suppress health warnings for 3 minutes after startup
         self._startup_vms: list = []  # [(vmid, vmname, 'vm'|'ct'), ...]
         self._startup_aggregated = False  # have we already sent the aggregated message?
 
@@ -57,9 +62,18 @@ class _SharedState:
             return (time.time() - self._shutdown_time) < self._shutdown_grace
 
     def is_startup_period(self) -> bool:
-        """Check if we're within the startup aggregation period."""
+        """Check if we're within the startup VM aggregation period (2 min)."""
         with self._lock:
-            return (time.time() - self._startup_time) < self._startup_grace
+            return (time.time() - self._startup_time) < self._startup_vm_grace
+
+    def is_startup_health_grace(self) -> bool:
+        """Check if we're within the startup health grace period (3 min).
+
+        Used by PollingCollector to suppress transient health warnings
+        (QMP timeout, storage not ready, etc.) during system boot.
+        """
+        with self._lock:
+            return (time.time() - self._startup_time) < self._startup_health_grace
 
     def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
         """Record a VM/CT start during startup period for later aggregation."""
@@ -1769,7 +1783,6 @@ class PollingCollector:
         # Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
         self._known_errors: Dict[str, dict] = {}
         self._first_poll_done = False
-        self._startup_time = time.time()  # Track when service started
 
     def start(self):
         if self._running:
@@ -1792,10 +1805,8 @@ class PollingCollector:
 
     # ── Main loop ──────────────────────────────────────────────
 
-    # Startup grace period: ignore transient errors from certain categories
-    # during the first N seconds after service start. Remote services like
-    # PBS storage, VMs with qemu-guest-agent, etc. may take time to connect.
-    STARTUP_GRACE_PERIOD = 180  # 3 minutes
+    # Categories where transient errors are suppressed during startup grace period.
+    # PBS storage, NFS mounts, VMs with qemu-guest-agent need time after boot.
     STARTUP_GRACE_CATEGORIES = {'storage', 'vms', 'network', 'pve_services'}
 
     def _poll_loop(self):
@@ -1907,8 +1918,8 @@ class PollingCollector:
                 # Startup grace period: ignore transient errors from categories that
                 # typically need time to stabilize after boot (storage, VMs, network).
                 # PBS storage, NFS mounts, VMs with qemu-guest-agent need time to connect.
-                time_since_startup = now - self._startup_time
-                if time_since_startup < self.STARTUP_GRACE_PERIOD:
+                # Uses the shared state so grace period is consistent across all watchers.
+                if _shared_state.is_startup_health_grace():
                     if category in self.STARTUP_GRACE_CATEGORIES:
                         # Still within grace period for this category - skip notification
                         continue
diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py
index 837c95ab..4b2bb46d 100644
--- a/AppImage/scripts/notification_manager.py
+++ b/AppImage/scripts/notification_manager.py
@@ -1015,6 +1015,37 @@ class NotificationManager:
 
     # ─── Public API (used by Flask routes and CLI) ──────────────
 
+    def emit_event(self, event_type: str, severity: str, data: Dict,
+                   source: str = 'api', entity: str = 'node', entity_id: str = '') -> Dict[str, Any]:
+        """Emit an event through the notification system.
+
+        This creates a NotificationEvent and processes it through the normal pipeline,
+        including toggle checks, template rendering, and cooldown.
+
+        Used by internal endpoints like the shutdown notification hook.
+
+        Args:
+            event_type: Type of event (must match TEMPLATES key)
+            severity: INFO, WARNING, CRITICAL
+            data: Event data for template rendering
+            source: Origin of event
+            entity: Entity type (node, vm, ct, storage, etc.)
+            entity_id: Entity identifier
+        """
+        from notification_events import NotificationEvent
+
+        event = NotificationEvent(
+            event_type=event_type,
+            severity=severity,
+            data=data,
+            source=source,
+            entity=entity,
+            entity_id=entity_id,
+        )
+
+        # Process the event through the normal pipeline
+        return self._process_single_event(event)
+
     def send_notification(self, event_type: str, severity: str,
                           title: str, message: str,
                           data: Optional[Dict] = None,
diff --git a/AppImage/scripts/shutdown-notify.sh b/AppImage/scripts/shutdown-notify.sh
new file mode 100644
index 00000000..7fd9674f
--- /dev/null
+++ b/AppImage/scripts/shutdown-notify.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# ProxMenux Monitor - Shutdown Notification Script
+# This script is called by systemd ExecStop before the service terminates.
+# It sends a shutdown/reboot notification via the running Flask server.
+
+PORT="${PORT:-5000}"
+
+# Determine if this is a reboot or shutdown.
+# While units are being stopped, reboot.target is typically still a queued
+# "start" job rather than active, so check the job queue first.
+is_reboot=false
+if systemctl list-jobs 2>/dev/null | grep -Eq 'reboot\.target.*start'; then
+    is_reboot=true
+elif systemctl is-active --quiet reboot.target 2>/dev/null; then
+    is_reboot=true
+elif [ -f /run/systemd/shutdown/scheduled ]; then
+    if grep -q "reboot" /run/systemd/shutdown/scheduled 2>/dev/null; then
+        is_reboot=true
+    fi
+fi
+
+# Build the event type and message
+if [ "$is_reboot" = true ]; then
+    event_type="system_reboot"
+    reason="The system is rebooting."
+else
+    event_type="system_shutdown"
+    reason="The system is shutting down."
+fi
+
+hostname=$(hostname)
+
+# Try to send notification via internal API endpoint
+# The Flask server may still be running at this point
+curl -s -X POST "http://127.0.0.1:$PORT/api/internal/shutdown-event" \
+    -H "Content-Type: application/json" \
+    -d "{\"event_type\": \"$event_type\", \"hostname\": \"$hostname\", \"reason\": \"$reason\"}" \
+    --max-time 5 2>/dev/null || true
+
+# Give the notification a moment to be sent
+sleep 2
+
+# Now terminate the Flask process.
+# systemd exports $MAINPID to ExecStop commands; prefer it so only the
+# service's own main process is signalled. Fall back to a name match.
+if [ -n "${MAINPID:-}" ]; then
+    kill -TERM "$MAINPID" 2>/dev/null || true
+else
+    pkill -TERM -f "proxmenux-monitor" 2>/dev/null || true
+fi
+
+exit 0