Mirror of https://github.com/MacRimi/ProxMenux.git, synced 2026-04-05 20:03:48 +00:00.
Update notification service
This commit is contained in:
@@ -99,6 +99,8 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
|
||||
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
|
||||
cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found"
|
||||
cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found"
|
||||
cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found"
|
||||
chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true
|
||||
|
||||
# Copy AI providers module for notification enhancement
|
||||
echo "📋 Copying AI providers module..."
|
||||
|
||||
@@ -951,3 +951,46 @@ def proxmox_webhook():
|
||||
except Exception as e:
|
||||
# Still return 200 to avoid PVE flagging the webhook as broken
|
||||
return jsonify({'accepted': False, 'error': 'internal_error', 'detail': str(e)}), 200
|
||||
|
||||
|
||||
# ─── Internal Shutdown Event Endpoint ─────────────────────────────
|
||||
|
||||
@notification_bp.route('/api/internal/shutdown-event', methods=['POST'])
def internal_shutdown_event():
    """
    Internal endpoint called by systemd ExecStop script to emit shutdown/reboot notification.
    This allows the service to send a notification BEFORE it terminates.

    Only accepts requests from localhost for security.

    Returns:
        200 with {'success': True, 'event_type': ...} on success.
        400 when event_type is not a supported shutdown event.
        403 when the caller is not a loopback address.
        500 when emitting the event raises.
    """
    # Security: Only allow localhost callers. '::ffff:127.0.0.1' covers the
    # IPv4-mapped form reported when the server listens on a dual-stack IPv6
    # socket; without it a legitimate localhost request would be rejected.
    remote_addr = request.remote_addr
    if remote_addr not in ('127.0.0.1', '::1', '::ffff:127.0.0.1', 'localhost'):
        return jsonify({'error': 'forbidden', 'detail': 'localhost only'}), 403

    try:
        data = request.get_json(silent=True) or {}
        event_type = data.get('event_type', 'system_shutdown')
        hostname = data.get('hostname', 'unknown')
        reason = data.get('reason', 'System is shutting down.')

        # Validate event type: only the two shutdown-related events are accepted.
        if event_type not in ('system_shutdown', 'system_reboot'):
            return jsonify({'error': 'invalid_event_type'}), 400

        # Emit the notification directly through notification_manager so it
        # flows through the normal pipeline (toggles, templates, cooldown).
        notification_manager.emit_event(
            event_type=event_type,
            severity='INFO',
            data={
                'hostname': hostname,
                'reason': reason,
            },
            source='systemd',
            entity='node',
            entity_id='',
        )

        return jsonify({'success': True, 'event_type': event_type}), 200
    except Exception as e:
        return jsonify({'error': 'internal_error', 'detail': str(e)}), 500
|
||||
|
||||
@@ -34,13 +34,18 @@ class _SharedState:
|
||||
Used to coordinate behavior when host-level events affect VM/CT events:
|
||||
- Suppress vm_stop/ct_stop during host shutdown (they're expected)
|
||||
- Aggregate vm_start/ct_start during startup into single message
|
||||
|
||||
Two separate grace periods:
|
||||
- startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
|
||||
- startup_health_grace: Time to suppress transient health errors (longer, 3 min)
|
||||
"""
|
||||
def __init__(self):
|
||||
self._lock = threading.Lock()
|
||||
self._shutdown_time: float = 0 # timestamp when shutdown was detected
|
||||
self._shutdown_grace = 120 # suppress VM/CT stops for 2 minutes after shutdown detected
|
||||
self._startup_time: float = time.time() # when module was loaded (service start)
|
||||
self._startup_grace = 300 # aggregate VM/CT starts for 5 minutes after startup
|
||||
self._startup_vm_grace = 120 # aggregate VM/CT starts for 2 minutes after startup
|
||||
self._startup_health_grace = 180 # suppress health warnings for 3 minutes after startup
|
||||
self._startup_vms: list = [] # [(vmid, vmname, 'vm'|'ct'), ...]
|
||||
self._startup_aggregated = False # have we already sent the aggregated message?
|
||||
|
||||
@@ -57,9 +62,18 @@ class _SharedState:
|
||||
return (time.time() - self._shutdown_time) < self._shutdown_grace
|
||||
|
||||
def is_startup_period(self) -> bool:
|
||||
"""Check if we're within the startup aggregation period."""
|
||||
"""Check if we're within the startup VM aggregation period (2 min)."""
|
||||
with self._lock:
|
||||
return (time.time() - self._startup_time) < self._startup_grace
|
||||
return (time.time() - self._startup_time) < self._startup_vm_grace
|
||||
|
||||
def is_startup_health_grace(self) -> bool:
    """Report whether the startup health grace period (3 min) is still active.

    Used by PollingCollector to suppress transient health warnings
    (QMP timeout, storage not ready, etc.) during system boot.
    """
    with self._lock:
        elapsed = time.time() - self._startup_time
        return elapsed < self._startup_health_grace
|
||||
|
||||
def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
|
||||
"""Record a VM/CT start during startup period for later aggregation."""
|
||||
@@ -1769,7 +1783,6 @@ class PollingCollector:
|
||||
# Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
|
||||
self._known_errors: Dict[str, dict] = {}
|
||||
self._first_poll_done = False
|
||||
self._startup_time = time.time() # Track when service started
|
||||
|
||||
def start(self):
|
||||
if self._running:
|
||||
@@ -1792,10 +1805,8 @@ class PollingCollector:
|
||||
|
||||
# ── Main loop ──────────────────────────────────────────────
|
||||
|
||||
# Startup grace period: ignore transient errors from certain categories
|
||||
# during the first N seconds after service start. Remote services like
|
||||
# PBS storage, VMs with qemu-guest-agent, etc. may take time to connect.
|
||||
STARTUP_GRACE_PERIOD = 180 # 3 minutes
|
||||
# Categories where transient errors are suppressed during startup grace period.
|
||||
# PBS storage, NFS mounts, VMs with qemu-guest-agent need time after boot.
|
||||
STARTUP_GRACE_CATEGORIES = {'storage', 'vms', 'network', 'pve_services'}
|
||||
|
||||
def _poll_loop(self):
|
||||
@@ -1907,8 +1918,8 @@ class PollingCollector:
|
||||
# Startup grace period: ignore transient errors from categories that
|
||||
# typically need time to stabilize after boot (storage, VMs, network).
|
||||
# PBS storage, NFS mounts, VMs with qemu-guest-agent need time to connect.
|
||||
time_since_startup = now - self._startup_time
|
||||
if time_since_startup < self.STARTUP_GRACE_PERIOD:
|
||||
# Uses the shared state so grace period is consistent across all watchers.
|
||||
if _shared_state.is_startup_health_grace():
|
||||
if category in self.STARTUP_GRACE_CATEGORIES:
|
||||
# Still within grace period for this category - skip notification
|
||||
continue
|
||||
|
||||
@@ -1015,6 +1015,37 @@ class NotificationManager:
|
||||
|
||||
# ─── Public API (used by Flask routes and CLI) ──────────────
|
||||
|
||||
def emit_event(self, event_type: str, severity: str, data: Dict,
               source: str = 'api', entity: str = 'node', entity_id: str = '') -> Dict[str, Any]:
    """Emit an event through the notification system.

    Builds a NotificationEvent and runs it through the standard processing
    pipeline — toggle checks, template rendering, and cooldown — exactly as
    events from collectors are handled.

    Used by internal endpoints like the shutdown notification hook.

    Args:
        event_type: Type of event (must match TEMPLATES key)
        severity: INFO, WARNING, CRITICAL
        data: Event data for template rendering
        source: Origin of event
        entity: Entity type (node, vm, ct, storage, etc.)
        entity_id: Entity identifier
    """
    # Imported lazily to avoid a module-level import cycle.
    from notification_events import NotificationEvent

    payload = NotificationEvent(
        event_type=event_type,
        severity=severity,
        data=data,
        source=source,
        entity=entity,
        entity_id=entity_id,
    )
    # Hand the event to the normal single-event pipeline and return its result.
    return self._process_single_event(payload)
|
||||
|
||||
def send_notification(self, event_type: str, severity: str,
|
||||
title: str, message: str,
|
||||
data: Optional[Dict] = None,
|
||||
|
||||
47
AppImage/scripts/shutdown-notify.sh
Normal file
47
AppImage/scripts/shutdown-notify.sh
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# ProxMenux Monitor - Shutdown Notification Script
# Invoked by systemd ExecStop just before the service terminates: it asks the
# still-running Flask server to emit a shutdown/reboot notification, then
# stops the monitor process itself.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG_DIR="/var/lib/proxmenux"
CONFIG_FILE="$CONFIG_DIR/config.json"
PORT="${PORT:-5000}"

# Decide whether this stop is part of a reboot or a full shutdown by probing
# the systemd reboot target and the scheduled-shutdown marker file.
reboot_detected=false
if systemctl is-active --quiet reboot.target 2>/dev/null; then
    reboot_detected=true
elif [ -f /run/systemd/shutdown/scheduled ] && grep -q "reboot" /run/systemd/shutdown/scheduled 2>/dev/null; then
    reboot_detected=true
fi

# Pick the event type and human-readable reason for the notification.
if [ "$reboot_detected" = true ]; then
    evt="system_reboot"
    why="The system is rebooting."
else
    evt="system_shutdown"
    why="The system is shutting down."
fi

node_name=$(hostname)

# Best-effort POST to the internal API endpoint; the Flask server may still
# be running at this point. Failures are ignored so shutdown is never blocked.
curl -s -X POST "http://127.0.0.1:$PORT/api/internal/shutdown-event" \
    -H "Content-Type: application/json" \
    -d "{\"event_type\": \"$evt\", \"hostname\": \"$node_name\", \"reason\": \"$why\"}" \
    --max-time 5 2>/dev/null || true

# Brief pause so the notification has a chance to go out before Flask dies.
sleep 2

# Terminate the main monitor process with SIGTERM.
pkill -TERM -f "proxmenux-monitor" 2>/dev/null || true

exit 0
|
||||
Reference in New Issue
Block a user