Update notification service

2026-05-23 09:04:44 +00:00 · 2026-03-25 19:21:37 +01:00
parent 2241b125d6
commit 6c2b03ae76
5 changed files with 144 additions and 10 deletions
@@ -99,6 +99,8 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
 cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  oci_manager.py not found"
 cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  flask_oci_routes.py not found"
 cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  description_templates.py not found"
+cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  shutdown-notify.sh not found"
+chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true

 # Copy AI providers module for notification enhancement
 echo "📋 Copying AI providers module..."
@@ -951,3 +951,46 @@ def proxmox_webhook():
    except Exception as e:
        # Still return 200 to avoid PVE flagging the webhook as broken
        return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200
+
+
+# ─── Internal Shutdown Event Endpoint ─────────────────────────────
+
+@notification_bp.route('/api/internal/shutdown-event', methods=['POST'])
+def internal_shutdown_event():
+    """
+    Internal endpoint called by systemd ExecStop script to emit shutdown/reboot notification.
+    This allows the service to send a notification BEFORE it terminates.
+    
+    Only accepts requests from localhost (127.0.0.1) for security.
+    """
+    # Security: Only allow localhost
+    remote_addr = request.remote_addr
+    if remote_addr not in ('127.0.0.1', '::1', 'localhost'):
+        return jsonify({'error': 'forbidden', 'detail': 'localhost only'}), 403
+    
+    try:
+        data = request.get_json(silent=True) or {}
+        event_type = data.get('event_type', 'system_shutdown')
+        hostname = data.get('hostname', 'unknown')
+        reason = data.get('reason', 'System is shutting down.')
+        
+        # Validate event type
+        if event_type not in ('system_shutdown', 'system_reboot'):
+            return jsonify({'error': 'invalid_event_type'}), 400
+        
+        # Emit the notification directly through notification_manager
+        notification_manager.emit_event(
+            event_type=event_type,
+            severity='INFO',
+            data={
+                'hostname': hostname,
+                'reason': reason,
+            },
+            source='systemd',
+            entity='node',
+            entity_id='',
+        )
+        
+        return jsonify({'success': True, 'event_type': event_type}), 200
+    except Exception as e:
+        return jsonify({'error': 'internal_error', 'detail': str(e)}), 500
@@ -34,13 +34,18 @@ class _SharedState:
    Used to coordinate behavior when host-level events affect VM/CT events:
    - Suppress vm_stop/ct_stop during host shutdown (they're expected)
    - Aggregate vm_start/ct_start during startup into single message
+    
+    Two separate grace periods:
+    - startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
+    - startup_health_grace: Time to suppress transient health errors (longer, 3 min)
    """
    def __init__(self):
        self._lock = threading.Lock()
        self._shutdown_time: float = 0  # timestamp when shutdown was detected
        self._shutdown_grace = 120  # suppress VM/CT stops for 2 minutes after shutdown detected
        self._startup_time: float = time.time()  # when module was loaded (service start)
-        self._startup_grace = 300  # aggregate VM/CT starts for 5 minutes after startup
+        self._startup_vm_grace = 120  # aggregate VM/CT starts for 2 minutes after startup
+        self._startup_health_grace = 180  # suppress health warnings for 3 minutes after startup
        self._startup_vms: list = []  # [(vmid, vmname, 'vm'|'ct'), ...]
        self._startup_aggregated = False  # have we already sent the aggregated message?
    
@@ -57,9 +62,18 @@ class _SharedState:
            return (time.time() - self._shutdown_time) < self._shutdown_grace
    
    def is_startup_period(self) -> bool:
-        """Check if we're within the startup aggregation period."""
+        """Check if we're within the startup VM aggregation period (2 min)."""
        with self._lock:
-            return (time.time() - self._startup_time) < self._startup_grace
+            return (time.time() - self._startup_time) < self._startup_vm_grace
+    
+    def is_startup_health_grace(self) -> bool:
+        """Check if we're within the startup health grace period (3 min).
+        
+        Used by PollingCollector to suppress transient health warnings
+        (QMP timeout, storage not ready, etc.) during system boot.
+        """
+        with self._lock:
+            return (time.time() - self._startup_time) < self._startup_health_grace
    
    def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
        """Record a VM/CT start during startup period for later aggregation."""
@@ -1769,7 +1783,6 @@ class PollingCollector:
        # Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
        self._known_errors: Dict[str, dict] = {}
        self._first_poll_done = False
-        self._startup_time = time.time()  # Track when service started
    
    def start(self):
        if self._running:
@@ -1792,10 +1805,8 @@ class PollingCollector:
    
    # ── Main loop ──────────────────────────────────────────────
    
-    # Startup grace period: ignore transient errors from certain categories
-    # during the first N seconds after service start.  Remote services like
-    # PBS storage, VMs with qemu-guest-agent, etc. may take time to connect.
-    STARTUP_GRACE_PERIOD = 180  # 3 minutes
+    # Categories where transient errors are suppressed during startup grace period.
+    # PBS storage, NFS mounts, VMs with qemu-guest-agent need time after boot.
    STARTUP_GRACE_CATEGORIES = {'storage', 'vms', 'network', 'pve_services'}
    
    def _poll_loop(self):
@@ -1907,8 +1918,8 @@ class PollingCollector:
            # Startup grace period: ignore transient errors from categories that
            # typically need time to stabilize after boot (storage, VMs, network).
            # PBS storage, NFS mounts, VMs with qemu-guest-agent need time to connect.
-            time_since_startup = now - self._startup_time
-            if time_since_startup < self.STARTUP_GRACE_PERIOD:
+            # Uses the shared state so grace period is consistent across all watchers.
+            if _shared_state.is_startup_health_grace():
                if category in self.STARTUP_GRACE_CATEGORIES:
                    # Still within grace period for this category - skip notification
                    continue
@@ -1015,6 +1015,37 @@ class NotificationManager:
    
    # ─── Public API (used by Flask routes and CLI) ──────────────
    
+    def emit_event(self, event_type: str, severity: str, data: Dict,
+                   source: str = 'api', entity: str = 'node', entity_id: str = '') -> Dict[str, Any]:
+        """Emit an event through the notification system.
+        
+        This creates a NotificationEvent and processes it through the normal pipeline,
+        including toggle checks, template rendering, and cooldown.
+        
+        Used by internal endpoints like the shutdown notification hook.
+        
+        Args:
+            event_type: Type of event (must match TEMPLATES key)
+            severity: INFO, WARNING, CRITICAL
+            data: Event data for template rendering
+            source: Origin of event
+            entity: Entity type (node, vm, ct, storage, etc.)
+            entity_id: Entity identifier
+        """
+        from notification_events import NotificationEvent
+        
+        event = NotificationEvent(
+            event_type=event_type,
+            severity=severity,
+            data=data,
+            source=source,
+            entity=entity,
+            entity_id=entity_id,
+        )
+        
+        # Process the event through the normal pipeline
+        return self._process_single_event(event)
+    
    def send_notification(self, event_type: str, severity: str,
                          title: str, message: str,
                          data: Optional[Dict] = None,
@@ -0,0 +1,47 @@
+#!/bin/bash
+# ProxMenux Monitor - Shutdown Notification Script
+# This script is called by systemd ExecStop before the service terminates.
+# It sends a shutdown/reboot notification via the running Flask server.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CONFIG_DIR="/var/lib/proxmenux"
+CONFIG_FILE="$CONFIG_DIR/config.json"
+PORT="${PORT:-5000}"
+
+# Determine if this is a reboot or shutdown
+# Check for systemd targets or runlevel
+is_reboot=false
+if systemctl is-active --quiet reboot.target 2>/dev/null; then
+    is_reboot=true
+elif [ -f /run/systemd/shutdown/scheduled ]; then
+    if grep -q "reboot" /run/systemd/shutdown/scheduled 2>/dev/null; then
+        is_reboot=true
+    fi
+fi
+
+# Build the event type and message
+if [ "$is_reboot" = true ]; then
+    event_type="system_reboot"
+    reason="The system is rebooting."
+else
+    event_type="system_shutdown"
+    reason="The system is shutting down."
+fi
+
+hostname=$(hostname)
+
+# Try to send notification via internal API endpoint
+# The Flask server may still be running at this point
+curl -s -X POST "http://127.0.0.1:$PORT/api/internal/shutdown-event" \
+    -H "Content-Type: application/json" \
+    -d "{\"event_type\": \"$event_type\", \"hostname\": \"$hostname\", \"reason\": \"$reason\"}" \
+    --max-time 5 2>/dev/null || true
+
+# Give the notification a moment to be sent
+sleep 2
+
+# Now terminate the Flask process
+# Find the main process and send SIGTERM
+pkill -TERM -f "proxmenux-monitor" 2>/dev/null || true
+
+exit 0