From 68872d0e060efd5f650e828057f9c280f0610def Mon Sep 17 00:00:00 2001 From: MacRimi Date: Wed, 25 Mar 2026 20:12:08 +0100 Subject: [PATCH] Update notification service --- AppImage/scripts/build_appimage.sh | 2 - AppImage/scripts/health_persistence.py | 2 +- AppImage/scripts/notification_events.py | 8 +- AppImage/scripts/notification_templates.py | 2 +- AppImage/scripts/test_all_notifications.sh | 481 ------------ AppImage/scripts/test_disk_observation.py | 131 ---- AppImage/scripts/test_real_events.sh | 732 ------------------ .../scripts => scripts}/shutdown-notify.sh | 0 8 files changed, 6 insertions(+), 1352 deletions(-) delete mode 100644 AppImage/scripts/test_all_notifications.sh delete mode 100644 AppImage/scripts/test_disk_observation.py delete mode 100644 AppImage/scripts/test_real_events.sh rename {AppImage/scripts => scripts}/shutdown-notify.sh (100%) diff --git a/AppImage/scripts/build_appimage.sh b/AppImage/scripts/build_appimage.sh index 585a472c..0f540547 100644 --- a/AppImage/scripts/build_appimage.sh +++ b/AppImage/scripts/build_appimage.sh @@ -99,8 +99,6 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found" cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found" cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found" -cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found" -chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true # Copy AI providers module for notification enhancement echo "📋 Copying AI providers module..." diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 7758f171..70a81170 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -1325,7 +1325,7 @@ class HealthPersistence: print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}") - # ───────────────────────────────────────────────────────────────�� + # ──────────────────────────────────────────────────────────────── # Disk Observations API # ──────────────────────────────────────────────────────────────── diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 5bcd2aa6..191e2ad2 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -37,7 +37,7 @@ class _SharedState: Two separate grace periods: - startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min) - - startup_health_grace: Time to suppress transient health errors (longer, 3 min) + - startup_health_grace: Time to suppress transient health errors (longer, 5 min) """ def __init__(self): self._lock = threading.Lock() @@ -45,7 +45,7 @@ class _SharedState: self._shutdown_grace = 120 # suppress VM/CT stops for 2 minutes after shutdown detected self._startup_time: float = time.time() # when module was loaded (service start) self._startup_vm_grace = 120 # aggregate VM/CT starts for 2 minutes after startup - self._startup_health_grace = 180 # suppress health warnings for 3 minutes after startup + self._startup_health_grace = 300 # suppress health warnings for 5 minutes after startup self._startup_vms: list = [] # [(vmid, vmname, 'vm'|'ct'), ...] self._startup_aggregated = False # have we already sent the aggregated message? @@ -67,10 +67,10 @@ class _SharedState: return (time.time() - self._startup_time) < self._startup_vm_grace def is_startup_health_grace(self) -> bool: - """Check if we're within the startup health grace period (3 min). + """Check if we're within the startup health grace period (5 min). Used by PollingCollector to suppress transient health warnings - (QMP timeout, storage not ready, etc.) during system boot. + (QMP timeout, storage not ready, high latency, etc.) during system boot. """ with self._lock: return (time.time() - self._startup_time) < self._startup_health_grace diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 897556f0..b1a1fd93 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -1064,7 +1064,7 @@ def get_default_enabled_events() -> Dict[str, bool]: } -# ─── Emoji Enrichment (per-channel opt-in) ────────────────────── +# ──�� Emoji Enrichment (per-channel opt-in) ────────────────────── # Category-level header icons CATEGORY_EMOJI = { diff --git a/AppImage/scripts/test_all_notifications.sh b/AppImage/scripts/test_all_notifications.sh deleted file mode 100644 index 725ebc5d..00000000 --- a/AppImage/scripts/test_all_notifications.sh +++ /dev/null @@ -1,481 +0,0 @@ -#!/bin/bash -# ============================================================================ -# ProxMenux Notification System - Complete Test Suite -# ============================================================================ -# -# Usage: -# chmod +x test_all_notifications.sh -# ./test_all_notifications.sh # Run ALL tests (with 3s pause between) -# ./test_all_notifications.sh system # Run only System category -# ./test_all_notifications.sh vm_ct # Run only VM/CT category -# ./test_all_notifications.sh backup # Run only Backup category -# ./test_all_notifications.sh resources # Run only Resources category -# ./test_all_notifications.sh storage # Run only Storage category -# ./test_all_notifications.sh network # Run only Network category -# ./test_all_notifications.sh security # Run only Security category -# ./test_all_notifications.sh cluster # Run only Cluster category -# ./test_all_notifications.sh burst # Run only Burst aggregation tests -# -# Each test sends a simulated webhook to the local notification endpoint. -# Check your Telegram/Gotify/Discord/Email for the notifications. -# ============================================================================ - -API="http://127.0.0.1:8008/api/notifications/webhook" -PAUSE=3 # seconds between tests - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' # No Color -BOLD='\033[1m' - -test_count=0 -pass_count=0 -fail_count=0 - -send_test() { - local name="$1" - local payload="$2" - test_count=$((test_count + 1)) - - echo -e "${CYAN} [$test_count] ${BOLD}$name${NC}" - - response=$(curl -s -w "\n%{http_code}" -X POST "$API" \ - -H "Content-Type: application/json" \ - -d "$payload" 2>&1) - - http_code=$(echo "$response" | tail -1) - body=$(echo "$response" | head -n -1) - - if [ "$http_code" = "200" ] || [ "$http_code" = "202" ]; then - echo -e " ${GREEN}HTTP $http_code${NC} - $body" - pass_count=$((pass_count + 1)) - else - echo -e " ${RED}HTTP $http_code${NC} - $body" - fail_count=$((fail_count + 1)) - fi - - sleep "$PAUSE" -} - -# ============================================================================ -# SYSTEM CATEGORY (group: system) -# ============================================================================ -test_system() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} SYSTEM - Startup, shutdown, kernel${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. state_change (disabled by default -- test to verify it does NOT arrive) - send_test "state_change (should NOT arrive - disabled by default)" \ - '{"type":"state_change","component":"health","severity":"warning","title":"overall changed to WARNING","body":"overall status changed from OK to WARNING."}' - - # 2. new_error - send_test "new_error" \ - '{"type":"new_error","component":"health","severity":"warning","title":"New WARNING - cpu","body":"CPU usage exceeds 90% for more than 5 minutes","category":"cpu"}' - - # 3. error_resolved - send_test "error_resolved" \ - '{"type":"error_resolved","component":"health","severity":"info","title":"Resolved - cpu","body":"CPU usage returned to normal.\nDuration: 15 minutes","category":"cpu","duration":"15 minutes"}' - - # 4. error_escalated - send_test "error_escalated" \ - '{"type":"error_escalated","component":"health","severity":"critical","title":"Escalated to CRITICAL - memory","body":"Memory usage exceeded 95% and swap is active","category":"memory"}' - - # 5. system_shutdown - send_test "system_shutdown" \ - '{"type":"system_shutdown","component":"system","severity":"warning","title":"System shutting down","body":"The system is shutting down.\nUser initiated shutdown."}' - - # 6. system_reboot - send_test "system_reboot" \ - '{"type":"system_reboot","component":"system","severity":"warning","title":"System rebooting","body":"The system is rebooting.\nKernel update applied."}' - - # 7. system_problem - send_test "system_problem" \ - '{"type":"system_problem","component":"system","severity":"critical","title":"System problem detected","body":"Kernel panic: Attempted to kill init! exitcode=0x00000009"}' - - # 8. service_fail - send_test "service_fail" \ - '{"type":"service_fail","component":"systemd","severity":"warning","title":"Service failed - pvedaemon","body":"Service pvedaemon has failed.\nUnit pvedaemon.service entered failed state.","service_name":"pvedaemon"}' - - # 9. update_available (legacy, superseded by update_summary) - send_test "update_available" \ - '{"type":"update_available","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 12\nSecurity: 3\nProxmox: 5\nKernel: 1\nImportant: pve-manager (8.3.5 -> 8.4.1)","total_count":"12","security_count":"3","pve_count":"5","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1)"}' - - # 10. update_complete - send_test "update_complete" \ - '{"type":"update_complete","component":"apt","severity":"info","title":"Update completed","body":"12 packages updated successfully."}' - - # 11. unknown_persistent - send_test "unknown_persistent" \ - '{"type":"unknown_persistent","component":"health","severity":"warning","title":"Check unavailable - temperature","body":"Health check for temperature has been unavailable for 3+ cycles.\nSensor not responding.","category":"temperature"}' - - # 12. health_persistent - send_test "health_persistent" \ - '{"type":"health_persistent","component":"health","severity":"warning","title":"3 active health issue(s)","body":"The following health issues remain active:\n- CPU at 92%\n- Memory at 88%\n- Disk /dev/sda at 94%\n\nThis digest is sent once every 24 hours while issues persist.","count":"3"}' - - # 13. health_issue_new - send_test "health_issue_new" \ - '{"type":"health_issue_new","component":"health","severity":"warning","title":"New health issue - disk","body":"New WARNING issue detected:\nDisk /dev/sda usage at 94%","category":"disk"}' - - # 14. health_issue_resolved - send_test "health_issue_resolved" \ - '{"type":"health_issue_resolved","component":"health","severity":"info","title":"Resolved - disk","body":"disk issue has been resolved.\nDisk usage dropped to 72%.\nDuration: 3 hours","category":"disk","duration":"3 hours"}' - - # 15. update_summary - send_test "update_summary" \ - '{"type":"update_summary","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 70\nSecurity updates: 9\nProxmox-related updates: 24\nKernel updates: 1\nImportant packages: pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)","total_count":"70","security_count":"9","pve_count":"24","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)"}' - - # 16. pve_update - send_test "pve_update" \ - '{"type":"pve_update","component":"apt","severity":"info","title":"Proxmox VE 8.4.1 available","body":"Proxmox VE 8.3.5 -> 8.4.1\npve-manager 8.3.5 -> 8.4.1","current_version":"8.3.5","new_version":"8.4.1","version":"8.4.1","details":"pve-manager 8.3.5 -> 8.4.1"}' -} - -# ============================================================================ -# VM / CT CATEGORY (group: vm_ct) -# ============================================================================ -test_vm_ct() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} VM / CT - Start, stop, crash, migration${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. vm_start - send_test "vm_start" \ - '{"type":"vm_start","component":"qemu","severity":"info","title":"VM 100 started","body":"ubuntu-server (100) has been started.","vmid":"100","vmname":"ubuntu-server"}' - - # 2. vm_stop - send_test "vm_stop" \ - '{"type":"vm_stop","component":"qemu","severity":"info","title":"VM 100 stopped","body":"ubuntu-server (100) has been stopped.","vmid":"100","vmname":"ubuntu-server"}' - - # 3. vm_shutdown - send_test "vm_shutdown" \ - '{"type":"vm_shutdown","component":"qemu","severity":"info","title":"VM 100 shutdown","body":"ubuntu-server (100) has been shut down.","vmid":"100","vmname":"ubuntu-server"}' - - # 4. vm_fail - send_test "vm_fail" \ - '{"type":"vm_fail","component":"qemu","severity":"critical","title":"VM 100 FAILED","body":"ubuntu-server (100) has failed.\nKVM: internal error: unexpected exit to hypervisor","vmid":"100","vmname":"ubuntu-server","reason":"KVM: internal error: unexpected exit to hypervisor"}' - - # 5. vm_restart - send_test "vm_restart" \ - '{"type":"vm_restart","component":"qemu","severity":"info","title":"VM 100 restarted","body":"ubuntu-server (100) has been restarted.","vmid":"100","vmname":"ubuntu-server"}' - - # 6. ct_start - send_test "ct_start" \ - '{"type":"ct_start","component":"lxc","severity":"info","title":"CT 200 started","body":"nginx-proxy (200) has been started.","vmid":"200","vmname":"nginx-proxy"}' - - # 7. ct_stop - send_test "ct_stop" \ - '{"type":"ct_stop","component":"lxc","severity":"info","title":"CT 200 stopped","body":"nginx-proxy (200) has been stopped.","vmid":"200","vmname":"nginx-proxy"}' - - # 8. ct_fail - send_test "ct_fail" \ - '{"type":"ct_fail","component":"lxc","severity":"critical","title":"CT 200 FAILED","body":"nginx-proxy (200) has failed.\nContainer exited with error code 137","vmid":"200","vmname":"nginx-proxy","reason":"Container exited with error code 137"}' - - # 9. migration_start - send_test "migration_start" \ - '{"type":"migration_start","component":"qemu","severity":"info","title":"Migration started - 100","body":"ubuntu-server (100) migration to pve-node2 started.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}' - - # 10. migration_complete - send_test "migration_complete" \ - '{"type":"migration_complete","component":"qemu","severity":"info","title":"Migration complete - 100","body":"ubuntu-server (100) migrated successfully to pve-node2.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}' - - # 11. migration_fail - send_test "migration_fail" \ - '{"type":"migration_fail","component":"qemu","severity":"critical","title":"Migration FAILED - 100","body":"ubuntu-server (100) migration to pve-node2 failed.\nNetwork timeout during memory transfer","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2","reason":"Network timeout during memory transfer"}' - - # 12. replication_fail - send_test "replication_fail" \ - '{"type":"replication_fail","component":"replication","severity":"critical","title":"Replication FAILED - 100","body":"Replication of ubuntu-server (100) has failed.\nTarget storage unreachable","vmid":"100","vmname":"ubuntu-server","reason":"Target storage unreachable"}' - - # 13. replication_complete - send_test "replication_complete" \ - '{"type":"replication_complete","component":"replication","severity":"info","title":"Replication complete - 100","body":"Replication of ubuntu-server (100) completed successfully.","vmid":"100","vmname":"ubuntu-server"}' -} - -# ============================================================================ -# BACKUP CATEGORY (group: backup) -# ============================================================================ -test_backup() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} BACKUPS - Backup start, complete, fail${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. backup_start - send_test "backup_start" \ - '{"type":"backup_start","component":"vzdump","severity":"info","title":"Backup started - 100","body":"Backup of ubuntu-server (100) has started.","vmid":"100","vmname":"ubuntu-server"}' - - # 2. backup_complete - send_test "backup_complete" \ - '{"type":"backup_complete","component":"vzdump","severity":"info","title":"Backup complete - 100","body":"Backup of ubuntu-server (100) completed successfully.\nSize: 12.4 GB","vmid":"100","vmname":"ubuntu-server","size":"12.4 GB"}' - - # 3. backup_fail - send_test "backup_fail" \ - '{"type":"backup_fail","component":"vzdump","severity":"critical","title":"Backup FAILED - 100","body":"Backup of ubuntu-server (100) has failed.\nStorage local-lvm is full","vmid":"100","vmname":"ubuntu-server","reason":"Storage local-lvm is full"}' - - # 4. snapshot_complete - send_test "snapshot_complete" \ - '{"type":"snapshot_complete","component":"qemu","severity":"info","title":"Snapshot created - 100","body":"Snapshot of ubuntu-server (100) created: pre-upgrade-2026","vmid":"100","vmname":"ubuntu-server","snapshot_name":"pre-upgrade-2026"}' - - # 5. snapshot_fail - send_test "snapshot_fail" \ - '{"type":"snapshot_fail","component":"qemu","severity":"critical","title":"Snapshot FAILED - 100","body":"Snapshot of ubuntu-server (100) failed.\nInsufficient space on storage","vmid":"100","vmname":"ubuntu-server","reason":"Insufficient space on storage"}' -} - -# ============================================================================ -# RESOURCES CATEGORY (group: resources) -# ============================================================================ -test_resources() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} RESOURCES - CPU, memory, temperature${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. cpu_high - send_test "cpu_high" \ - '{"type":"cpu_high","component":"health","severity":"warning","title":"High CPU usage (94%)","body":"CPU usage is at 94% on 16 cores.\nTop process: kvm (VM 100)","value":"94","cores":"16","details":"Top process: kvm (VM 100)"}' - - # 2. ram_high - send_test "ram_high" \ - '{"type":"ram_high","component":"health","severity":"warning","title":"High memory usage (91%)","body":"Memory usage: 58.2 GB / 64 GB (91%).\n4 VMs running, swap at 2.1 GB","value":"91","used":"58.2 GB","total":"64 GB","details":"4 VMs running, swap at 2.1 GB"}' - - # 3. temp_high - send_test "temp_high" \ - '{"type":"temp_high","component":"health","severity":"critical","title":"High temperature (89C)","body":"CPU temperature: 89C (threshold: 80C).\nCheck cooling system immediately","value":"89","threshold":"80","details":"Check cooling system immediately"}' - - # 4. load_high - send_test "load_high" \ - '{"type":"load_high","component":"health","severity":"warning","title":"High system load (24.5)","body":"System load average: 24.5 on 16 cores.\nI/O wait: 35%","value":"24.5","cores":"16","details":"I/O wait: 35%"}' -} - -# ============================================================================ -# STORAGE CATEGORY (group: storage) -# ============================================================================ -test_storage() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} STORAGE - Disk space, I/O errors, SMART${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. disk_space_low - send_test "disk_space_low" \ - '{"type":"disk_space_low","component":"storage","severity":"warning","title":"Low disk space on /var","body":"/var: 93% used (4.2 GB available).","mount":"/var","used":"93","available":"4.2 GB"}' - - # 2. disk_io_error - send_test "disk_io_error" \ - '{"type":"disk_io_error","component":"smart","severity":"critical","title":"Disk I/O error","body":"I/O error detected on /dev/sdb.\nSMART error: Current Pending Sector Count = 8","device":"/dev/sdb","reason":"SMART error: Current Pending Sector Count = 8"}' - - # 3. burst_disk_io - send_test "burst_disk_io" \ - '{"type":"burst_disk_io","component":"storage","severity":"critical","title":"5 disk I/O errors on /dev/sdb, /dev/sdc","body":"5 I/O errors detected in 60s.\nDevices: /dev/sdb, /dev/sdc","count":"5","window":"60s","entity_list":"/dev/sdb, /dev/sdc"}' -} - -# ============================================================================ -# NETWORK CATEGORY (group: network) -# ============================================================================ -test_network() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} NETWORK - Connectivity, bond, latency${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. network_down - send_test "network_down" \ - '{"type":"network_down","component":"network","severity":"critical","title":"Network connectivity lost","body":"Network connectivity check failed.\nGateway 192.168.1.1 unreachable. Bond vmbr0 degraded.","reason":"Gateway 192.168.1.1 unreachable. Bond vmbr0 degraded."}' - - # 2. network_latency - send_test "network_latency" \ - '{"type":"network_latency","component":"network","severity":"warning","title":"High network latency (450ms)","body":"Latency to gateway: 450ms (threshold: 100ms).","value":"450","threshold":"100"}' -} - -# ============================================================================ -# SECURITY CATEGORY (group: security) -# ============================================================================ -test_security() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} SECURITY - Auth failures, fail2ban, firewall${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. auth_fail - send_test "auth_fail" \ - '{"type":"auth_fail","component":"auth","severity":"warning","title":"Authentication failure","body":"Failed login attempt from 203.0.113.42.\nUser: root\nService: sshd","source_ip":"203.0.113.42","username":"root","service":"sshd"}' - - # 2. ip_block - send_test "ip_block" \ - '{"type":"ip_block","component":"security","severity":"info","title":"IP blocked by Fail2Ban","body":"IP 203.0.113.42 has been banned.\nJail: sshd\nFailures: 5","source_ip":"203.0.113.42","jail":"sshd","failures":"5"}' - - # 3. firewall_issue - send_test "firewall_issue" \ - '{"type":"firewall_issue","component":"firewall","severity":"warning","title":"Firewall issue detected","body":"Firewall rule conflict detected on vmbr0.\nRule 15 overlaps with rule 23, potentially blocking cluster traffic.","reason":"Firewall rule conflict detected on vmbr0. Rule 15 overlaps with rule 23."}' - - # 4. user_permission_change - send_test "user_permission_change" \ - '{"type":"user_permission_change","component":"auth","severity":"info","title":"User permission changed","body":"User: admin@pam\nChange: Added PVEAdmin role on /vms/100","username":"admin@pam","change_details":"Added PVEAdmin role on /vms/100"}' - - # 5. burst_auth_fail - send_test "burst_auth_fail" \ - '{"type":"burst_auth_fail","component":"security","severity":"warning","title":"8 auth failures in 2m","body":"8 authentication failures detected in 2m.\nSources: 203.0.113.42, 198.51.100.7, 192.0.2.15","count":"8","window":"2m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15"}' - - # 6. burst_ip_block - send_test "burst_ip_block" \ - '{"type":"burst_ip_block","component":"security","severity":"info","title":"Fail2Ban banned 4 IPs in 5m","body":"4 IPs banned by Fail2Ban in 5m.\nIPs: 203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99","count":"4","window":"5m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99"}' -} - -# ============================================================================ -# CLUSTER CATEGORY (group: cluster) -# ============================================================================ -test_cluster() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} CLUSTER - Quorum, split-brain, HA fencing${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - # 1. split_brain - send_test "split_brain" \ - '{"type":"split_brain","component":"cluster","severity":"critical","title":"SPLIT-BRAIN detected","body":"Cluster split-brain condition detected.\nQuorum status: No quorum - 1/3 nodes visible","quorum":"No quorum - 1/3 nodes visible"}' - - # 2. node_disconnect - send_test "node_disconnect" \ - '{"type":"node_disconnect","component":"corosync","severity":"critical","title":"Node disconnected","body":"Node pve-node3 has disconnected from the cluster.","node_name":"pve-node3"}' - - # 3. node_reconnect - send_test "node_reconnect" \ - '{"type":"node_reconnect","component":"corosync","severity":"info","title":"Node reconnected","body":"Node pve-node3 has reconnected to the cluster.","node_name":"pve-node3"}' - - # 4. burst_cluster - send_test "burst_cluster" \ - '{"type":"burst_cluster","component":"cluster","severity":"critical","title":"Cluster flapping detected (6 changes)","body":"Cluster state changed 6 times in 5m.\nNodes: pve-node2, pve-node3","count":"6","window":"5m","entity_list":"pve-node2, pve-node3"}' -} - -# ============================================================================ -# BURST AGGREGATION TESTS (send rapid events to trigger burst detection) -# ============================================================================ -test_burst() { - echo "" - echo -e "${YELLOW}========================================${NC}" - echo -e "${YELLOW} BURST - Rapid events to trigger aggregation${NC}" - echo -e "${YELLOW}========================================${NC}" - echo "" - - echo -e "${BLUE} Sending 5 rapid auth_fail events (should trigger burst_auth_fail)...${NC}" - for i in $(seq 1 5); do - curl -s -X POST "$API" \ - -H "Content-Type: application/json" \ - -d "{\"type\":\"auth_fail\",\"component\":\"auth\",\"severity\":\"warning\",\"title\":\"Auth fail from 10.0.0.$i\",\"body\":\"Failed login from 10.0.0.$i\",\"source_ip\":\"10.0.0.$i\"}" > /dev/null - echo -e " ${CYAN}Sent auth_fail $i/5${NC}" - sleep 0.5 - done - echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}" - sleep 10 - - echo "" - echo -e "${BLUE} Sending 4 rapid disk_io_error events (should trigger burst_disk_io)...${NC}" - for i in $(seq 1 4); do - curl -s -X POST "$API" \ - -H "Content-Type: application/json" \ - -d "{\"type\":\"disk_io_error\",\"component\":\"smart\",\"severity\":\"critical\",\"title\":\"I/O error on /dev/sd${i}\",\"body\":\"Error on device\",\"device\":\"/dev/sd${i}\"}" > /dev/null - echo -e " ${CYAN}Sent disk_io_error $i/4${NC}" - sleep 0.5 - done - echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}" - sleep 10 - - echo "" - echo -e "${BLUE} Sending 3 rapid node_disconnect events (should trigger burst_cluster)...${NC}" - for i in $(seq 1 3); do - curl -s -X POST "$API" \ - -H "Content-Type: application/json" \ - -d "{\"type\":\"node_disconnect\",\"component\":\"corosync\",\"severity\":\"critical\",\"title\":\"Node pve-node$i disconnected\",\"body\":\"Node lost\",\"node_name\":\"pve-node$i\"}" > /dev/null - echo -e " ${CYAN}Sent node_disconnect $i/3${NC}" - sleep 0.5 - done - echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}" - sleep 10 -} - -# ============================================================================ -# MAIN -# ============================================================================ - -echo "" -echo -e "${BOLD}============================================================${NC}" -echo -e "${BOLD} ProxMenux Notification System - Complete Test Suite${NC}" -echo -e "${BOLD}============================================================${NC}" -echo -e " API: $API" -echo -e " Pause: ${PAUSE}s between tests" -echo "" - -# Check that the service is reachable -status=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:8008/api/notifications/status" 2>/dev/null) -if [ "$status" != "200" ]; then - echo -e "${RED}ERROR: Notification service not reachable (HTTP $status)${NC}" - echo -e " Make sure ProxMenux Monitor is running." - exit 1 -fi -echo -e "${GREEN}Service is reachable.${NC}" - -# Parse argument -category="${1:-all}" - -case "$category" in - system) test_system ;; - vm_ct) test_vm_ct ;; - backup) test_backup ;; - resources) test_resources ;; - storage) test_storage ;; - network) test_network ;; - security) test_security ;; - cluster) test_cluster ;; - burst) test_burst ;; - all) - test_system - test_vm_ct - test_backup - test_resources - test_storage - test_network - test_security - test_cluster - test_burst - ;; - *) - echo -e "${RED}Unknown category: $category${NC}" - echo "Usage: $0 [system|vm_ct|backup|resources|storage|network|security|cluster|burst|all]" - exit 1 - ;; -esac - -# ============================================================================ -# SUMMARY -# ============================================================================ -echo "" -echo -e "${BOLD}============================================================${NC}" -echo -e "${BOLD} SUMMARY${NC}" -echo -e "${BOLD}============================================================${NC}" -echo -e " Total tests: $test_count" -echo -e " ${GREEN}Accepted:${NC} $pass_count" -echo -e " ${RED}Rejected:${NC} $fail_count" -echo "" -echo -e " Check your notification channels for the messages." -echo -e " Note: Some events may be filtered by your current settings" -echo -e " (severity filter, disabled categories, disabled individual events)." -echo "" -echo -e " To check notification history (all events):" -echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -m json.tool${NC}" -echo "" -echo -e " To count events by type:" -echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -c \"import sys,json; h=json.load(sys.stdin)['history']; [print(f' {t}: {c}') for t,c in sorted(dict((e['event_type'],sum(1 for x in h if x['event_type']==e['event_type'])) for e in h).items())]\"${NC} -echo "" diff --git a/AppImage/scripts/test_disk_observation.py b/AppImage/scripts/test_disk_observation.py deleted file mode 100644 index 9dfdfa9e..00000000 --- a/AppImage/scripts/test_disk_observation.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to simulate a disk error and verify observation recording. -Usage: python3 test_disk_observation.py [device_name] [error_type] - -Examples: - python3 test_disk_observation.py sdh io_error - python3 test_disk_observation.py sdh smart_error - python3 test_disk_observation.py sdh fs_error -""" - -import sys -import os - -# Add possible module locations to path -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, script_dir) -sys.path.insert(0, '/usr/local/share/proxmenux') -sys.path.insert(0, '/tmp/.mount_ProxMeztyU13/usr/bin') # AppImage mount point - -# Try to find the module -for path in sys.path: - if os.path.exists(os.path.join(path, 'health_persistence.py')): - print(f"[INFO] Found health_persistence.py in: {path}") - break - -from health_persistence import HealthPersistence -from datetime import datetime - -def main(): - device_name = sys.argv[1] if len(sys.argv) > 1 else 'sdh' - error_type = sys.argv[2] if len(sys.argv) > 2 else 'io_error' - - # Known serial for sdh (WDC 2TB) - serial_map = { - 'sdh': 'WD-WX72A30AA72R', - 'nvme0n1': '2241E675EA6C', - 'nvme1n1': '2241E675EBE6', - 'sda': '22440F443504', - 'sdb': 'WWZ1SJ18', - 'sdc': '52X0A0D9FZ1G', - 'sdd': '50026B7784446E63', - 'sde': '22440F442105', - 'sdf': 'WRQ0X2GP', - 'sdg': '23Q0A0MPFZ1G', - } - - serial = serial_map.get(device_name, None) - - # Error messages by type - error_messages = { - 'io_error': f'Test I/O error on /dev/{device_name}: sector read failed at LBA 12345678', - 'smart_error': f'/dev/{device_name}: SMART warning - 1 Currently unreadable (pending) sectors detected', - 'fs_error': f'EXT4-fs error (device {device_name}1): inode 123456: block 789012: error reading data', - } - - error_signatures = { - 'io_error': f'io_test_{device_name}', - 'smart_error': f'smart_test_{device_name}', - 'fs_error': f'fs_test_{device_name}', - } - - message = error_messages.get(error_type, f'Test error on /dev/{device_name}') - signature = error_signatures.get(error_type, f'test_{device_name}') - - print(f"\n{'='*60}") - print(f"Testing Disk Observation Recording") - print(f"{'='*60}") - print(f"Device: /dev/{device_name}") - print(f"Serial: {serial or 'Unknown'}") - print(f"Error Type: {error_type}") - print(f"Message: {message}") - print(f"Signature: {signature}") - print(f"{'='*60}\n") - - # Initialize persistence - hp = HealthPersistence() - - # Record the observation - print("[1] Recording observation...") - hp.record_disk_observation( - device_name=device_name, - serial=serial, - error_type=error_type, - error_signature=signature, - raw_message=message, - severity='warning' - ) - print(" OK - Observation recorded\n") - - # Query observations for this device - print("[2] Querying observations for this device...") - observations = hp.get_disk_observations(device_name=device_name, serial=serial) - - if observations: - print(f" Found {len(observations)} observation(s):\n") - for obs in observations: - print(f" ID: {obs['id']}") - print(f" Type: {obs['error_type']}") - print(f" Signature: {obs['error_signature']}") - print(f" Message: {obs['raw_message'][:80]}...") - print(f" Severity: {obs['severity']}") - print(f" First: {obs['first_occurrence']}") - print(f" Last: {obs['last_occurrence']}") - print(f" Count: {obs['occurrence_count']}") - print(f" Dismissed: {obs['dismissed']}") - print() - else: - print(" No observations found!\n") - - # Also show the disk registry - print("[3] Checking disk registry...") - all_devices = hp.get_all_observed_devices() - for dev in all_devices: - if dev.get('device_name') == device_name or dev.get('serial') == serial: - print(f" Found in registry:") - print(f" ID: {dev.get('id')}") - print(f" Device: {dev.get('device_name')}") - print(f" Serial: {dev.get('serial')}") - print(f" First seen: {dev.get('first_seen')}") - print(f" Last seen: {dev.get('last_seen')}") - print() - - print(f"{'='*60}") - print("Test complete! Check the Storage section in the UI.") - print(f"The disk /dev/{device_name} should now show an observations badge.") - print(f"{'='*60}\n") - - -if __name__ == '__main__': - main() diff --git a/AppImage/scripts/test_real_events.sh b/AppImage/scripts/test_real_events.sh deleted file mode 100644 index 16599fee..00000000 --- a/AppImage/scripts/test_real_events.sh +++ /dev/null @@ -1,732 +0,0 @@ -#!/bin/bash -# ============================================================================ -# ProxMenux - Real Proxmox Event Simulator -# ============================================================================ -# This script triggers ACTUAL events on Proxmox so that PVE's notification -# system fires real webhooks through the full pipeline: -# -# PVE event -> PVE notification -> webhook POST -> our pipeline -> Telegram -# -# Unlike test_all_notifications.sh (which injects directly via API), this -# script makes Proxmox generate the events itself. -# -# Usage: -# chmod +x test_real_events.sh -# ./test_real_events.sh # interactive menu -# ./test_real_events.sh disk # run disk tests only -# ./test_real_events.sh backup # run backup tests only -# ./test_real_events.sh all # run all tests -# ============================================================================ - -set -euo pipefail - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -CYAN='\033[0;36m' -BOLD='\033[1m' -NC='\033[0m' - -API="http://127.0.0.1:8008" -LOG_FILE="/tmp/proxmenux_real_test_$(date +%Y%m%d_%H%M%S).log" - -# ── Helpers ─────────────────────���─────────────────────────────── -log() { echo -e "$1" | tee -a "$LOG_FILE"; } -header() { - echo "" | tee -a "$LOG_FILE" - echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" | tee -a "$LOG_FILE" - echo -e "${BOLD} $1${NC}" | tee -a "$LOG_FILE" - echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" | tee -a "$LOG_FILE" -} - -warn() { log "${YELLOW} [!] $1${NC}"; } -ok() { log "${GREEN} [OK] $1${NC}"; } -fail() { log "${RED} [FAIL] $1${NC}"; } -info() { log "${CYAN} [i] $1${NC}"; } - -confirm() { - echo "" - echo -e "${YELLOW} $1${NC}" - echo -ne " Continue? [Y/n]: " - read -r ans - [[ -z "$ans" || "$ans" =~ ^[Yy] ]] -} - -wait_webhook() { - local seconds=${1:-10} - log " Waiting ${seconds}s for webhook delivery..." - sleep "$seconds" -} - -snapshot_history() { - curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c " -import sys, json -try: - data = json.load(sys.stdin) - count = len(data.get('history', [])) - print(count) -except: - print(0) -" 2>/dev/null || echo "0" -} - -check_new_events() { - local before=$1 - local after - after=$(snapshot_history) - local diff=$((after - before)) - if [ "$diff" -gt 0 ]; then - ok "Received $diff new notification(s) via webhook" - # Show the latest events - curl -s "${API}/api/notifications/history?limit=$((diff + 2))" 2>/dev/null | python3 -c " -import sys, json -data = json.load(sys.stdin) -for h in data.get('history', [])[:$diff]: - sev = h.get('severity', '?') - icon = {'CRITICAL': ' RED', 'WARNING': ' YEL', 'INFO': ' BLU'}.get(sev, ' ???') - print(f'{icon} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:60]}') -" 2>/dev/null | tee -a "$LOG_FILE" - else - warn "No new notifications detected (may need more time or check filters)" - fi -} - -# ── Pre-flight checks ────────────────────────────────────────── -preflight() { - header "Pre-flight Checks" - - # Check if running as root - if [ "$(id -u)" -ne 0 ]; then - fail "This script must be run as root" - exit 1 - fi - ok "Running as root" - - # Check ProxMenux is running - if curl -s "${API}/api/health" >/dev/null 2>&1; then - ok "ProxMenux Monitor is running" - else - fail "ProxMenux Monitor not reachable at ${API}" - exit 1 - fi - - # Check webhook is configured by querying PVE directly - if pvesh get /cluster/notifications/endpoints/webhook --output-format json 2>/dev/null | python3 -c " -import sys, json -endpoints = json.load(sys.stdin) -found = any('proxmenux' in e.get('name','').lower() for e in (endpoints if isinstance(endpoints, list) else [endpoints])) -exit(0 if found else 1) -" 2>/dev/null; then - ok "PVE webhook endpoint 'proxmenux-webhook' is configured" - else - warn "PVE webhook may not be configured. Run setup from the UI first." - if ! confirm "Continue anyway?"; then - exit 1 - fi - fi - - # Check notification config - # API returns { config: { enabled: true/false/'true'/'false', ... }, success: true } - if curl -s "${API}/api/notifications/settings" 2>/dev/null | python3 -c " -import sys, json -d = json.load(sys.stdin) -cfg = d.get('config', d) -enabled = cfg.get('enabled', False) -exit(0 if enabled is True or str(enabled).lower() == 'true' else 1) -" 2>/dev/null; then - ok "Notifications are enabled" - else - fail "Notifications are NOT enabled. Enable them in the UI first." - exit 1 - fi - - # Re-run webhook setup to ensure priv config and body template exist - info "Re-configuring PVE webhook (ensures priv config + body template)..." - local setup_result - setup_result=$(curl -s -X POST "${API}/api/notifications/proxmox/setup-webhook" 2>/dev/null) - if echo "$setup_result" | python3 -c "import sys,json; d=json.load(sys.stdin); exit(0 if d.get('configured') else 1)" 2>/dev/null; then - ok "PVE webhook re-configured successfully" - else - local setup_err - setup_err=$(echo "$setup_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','unknown'))" 2>/dev/null) - warn "Webhook setup returned: ${setup_err}" - warn "PVE webhook events may not work. Manual commands below:" - echo "$setup_result" | python3 -c " -import sys, json -d = json.load(sys.stdin) -for cmd in d.get('fallback_commands', []): - print(f' {cmd}') -" 2>/dev/null - if ! confirm "Continue anyway?"; then - exit 1 - fi - fi - - # Find a VM/CT for testing - VMID="" - VMNAME="" - VMTYPE="" - - # Try to find a stopped CT first (safest) - local cts - cts=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null || echo "[]") - - # Look for a stopped container - VMID=$(echo "$cts" | python3 -c " -import sys, json -vms = json.load(sys.stdin) -# Prefer stopped CTs, then stopped VMs -for v in sorted(vms, key=lambda x: (0 if x.get('type')=='lxc' else 1, 0 if x.get('status')=='stopped' else 1)): - if v.get('status') == 'stopped': - print(v.get('vmid', '')) - break -" 2>/dev/null || echo "") - - if [ -n "$VMID" ]; then - VMTYPE=$(echo "$cts" | python3 -c " -import sys, json -vms = json.load(sys.stdin) -for v in vms: - if str(v.get('vmid')) == '$VMID': - print(v.get('type', 'qemu')) - break -" 2>/dev/null) - VMNAME=$(echo "$cts" | python3 -c " -import sys, json -vms = json.load(sys.stdin) -for v in vms: - if str(v.get('vmid')) == '$VMID': - print(v.get('name', 'unknown')) - break -" 2>/dev/null) - ok "Found stopped ${VMTYPE} for testing: ${VMID} (${VMNAME})" - else - warn "No stopped VM/CT found. Backup tests will use ID 0 (host backup)." - fi - - # List available storage - info "Available storage:" - pvesh get /storage --output-format json 2>/dev/null | python3 -c " -import sys, json -stores = json.load(sys.stdin) -for s in stores: - sid = s.get('storage', '?') - stype = s.get('type', '?') - content = s.get('content', '?') - print(f' {sid:20s} type={stype:10s} content={content}') -" 2>/dev/null | tee -a "$LOG_FILE" || warn "Could not list storage" - - echo "" - log " Log file: ${LOG_FILE}" -} - -# ============================================================================ -# TEST CATEGORY: DISK ERRORS -# ============================================================================ -test_disk() { - header "DISK ERROR TESTS" - - # ── Test D1: SMART error injection ── - log "" - log "${BOLD} Test D1: SMART error log injection${NC}" - info "Writes a simulated SMART error to syslog so JournalWatcher catches it." - info "This tests the journal -> notification_events -> pipeline flow." - - local before - before=$(snapshot_history) - - # Inject a realistic SMART error into the system journal - logger -t kernel -p kern.err "ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x0 action 0x6 frozen" - sleep 1 - logger -t kernel -p kern.crit "ata1.00: failed command: READ FPDMA QUEUED" - sleep 1 - logger -t smartd -p daemon.warning "Device: /dev/sda [SAT], 1 Currently unreadable (pending) sectors" - - wait_webhook 8 - check_new_events "$before" - - # ── Test D2: ZFS error simulation ── - log "" - log "${BOLD} Test D2: ZFS scrub error simulation${NC}" - - # Check if ZFS is available - if command -v zpool >/dev/null 2>&1; then - local zpools - zpools=$(zpool list -H -o name 2>/dev/null || echo "") - - if [ -n "$zpools" ]; then - local pool - pool=$(echo "$zpools" | head -1) - info "ZFS pool found: ${pool}" - info "Injecting ZFS checksum error into syslog (non-destructive)." - - before=$(snapshot_history) - - # Simulate ZFS error events via syslog (non-destructive) - logger -t kernel -p kern.warning "ZFS: pool '${pool}' has experienced an error" - sleep 1 - logger -t zfs-module -p daemon.err "CHECKSUM error on ${pool}:mirror-0/sda: zio error" - - wait_webhook 8 - check_new_events "$before" - else - warn "ZFS installed but no pools found. Skipping ZFS test." - fi - else - warn "ZFS not installed. Skipping ZFS test." - fi - - # ── Test D3: Filesystem space pressure ── - log "" - log "${BOLD} Test D3: Disk space pressure simulation${NC}" - info "Creates a large temporary file to fill disk, triggering space warnings." - info "The Health Monitor should detect low disk space within ~60s." - - # Check current free space on / - local free_pct - free_pct=$(df / | tail -1 | awk '{print 100-$5}' | tr -d '%') - info "Current free space on /: ${free_pct}%" - - if [ "$free_pct" -gt 15 ]; then - info "Disk has ${free_pct}% free. Need to reduce below threshold for test." - - # Calculate how much to fill (leave only 8% free) - local total_k free_k fill_k - total_k=$(df / | tail -1 | awk '{print $2}') - free_k=$(df / | tail -1 | awk '{print $4}') - fill_k=$((free_k - (total_k * 8 / 100))) - - if [ "$fill_k" -gt 0 ] && [ "$fill_k" -lt 50000000 ]; then - info "Will create ${fill_k}KB temp file to simulate low space." - - if confirm "This will temporarily fill disk to ~92% on /. Safe to proceed?"; then - before=$(snapshot_history) - - dd if=/dev/zero of=/tmp/.proxmenux_disk_test bs=1024 count="$fill_k" 2>/dev/null || true - ok "Temp file created. Disk pressure active." - info "Waiting 90s for Health Monitor to detect low space..." - - # Wait for health monitor polling cycle - for i in $(seq 1 9); do - echo -ne "\r Waiting... ${i}0/90s" - sleep 10 - done - echo "" - - # Clean up immediately - rm -f /tmp/.proxmenux_disk_test - ok "Temp file removed. Disk space restored." - - check_new_events "$before" - else - warn "Skipped disk pressure test." - fi - else - warn "Cannot safely fill disk (would need ${fill_k}KB). Skipping." - fi - else - warn "Disk already at ${free_pct}% free. Health Monitor may already be alerting." - fi - - # ── Test D4: I/O error in syslog ── - log "" - log "${BOLD} Test D4: Generic I/O error injection${NC}" - info "Injects I/O errors into syslog for JournalWatcher." - - before=$(snapshot_history) - - logger -t kernel -p kern.err "Buffer I/O error on dev sdb1, logical block 0, async page read" - sleep 1 - logger -t kernel -p kern.err "EXT4-fs error (device sdb1): ext4_find_entry:1455: inode #2: comm ls: reading directory lblock 0" - - wait_webhook 8 - check_new_events "$before" -} - -# ============================================================================ -# TEST CATEGORY: BACKUP EVENTS -# ============================================================================ -test_backup() { - header "BACKUP EVENT TESTS" - - local backup_storage="" - - # Find backup-capable storage - backup_storage=$(pvesh get /storage --output-format json 2>/dev/null | python3 -c " -import sys, json -stores = json.load(sys.stdin) -for s in stores: - content = s.get('content', '') - if 'backup' in content or 'vztmpl' in content: - print(s.get('storage', '')) - break -# Fallback: try 'local' -else: - for s in stores: - if s.get('storage') == 'local': - print('local') - break -" 2>/dev/null || echo "local") - - info "Using backup storage: ${backup_storage}" - - # ── Test B1: Successful vzdump backup ── - if [ -n "$VMID" ]; then - log "" - log "${BOLD} Test B1: Real vzdump backup (success)${NC}" - info "Running a real vzdump backup of ${VMTYPE} ${VMID} (${VMNAME})." - info "This triggers PVE's notification system with a real backup event." - - if confirm "This will backup ${VMTYPE} ${VMID} to '${backup_storage}'. Proceed?"; then - local before - before=$(snapshot_history) - - # Use snapshot mode for VMs (non-disruptive), stop mode for CTs - local bmode="snapshot" - if [ "$VMTYPE" = "lxc" ]; then - bmode="suspend" - fi - - info "Starting vzdump (mode=${bmode}, compress=zstd)..." - if vzdump "$VMID" --storage "$backup_storage" --mode "$bmode" --compress zstd --notes-template "ProxMenux test backup" 2>&1 | tee -a "$LOG_FILE"; then - ok "vzdump completed successfully!" - else - warn "vzdump returned non-zero (check output above)" - fi - - wait_webhook 12 - check_new_events "$before" - - # Clean up the test backup - info "Cleaning up test backup file..." - local latest_bak - latest_bak=$(find "/var/lib/vz/dump/" -name "vzdump-*-${VMID}-*" -type f -newer /tmp/.proxmenux_bak_marker 2>/dev/null | head -1 || echo "") - # Create a marker for cleanup - touch /tmp/.proxmenux_bak_marker 2>/dev/null || true - else - warn "Skipped backup success test." - fi - - # ── Test B2: Failed vzdump backup ── - log "" - log "${BOLD} Test B2: vzdump backup failure (invalid storage)${NC}" - info "Attempting backup to non-existent storage to trigger a backup failure event." - - before=$(snapshot_history) - - # This WILL fail because the storage doesn't exist - info "Starting vzdump to fake storage (will fail intentionally)..." - vzdump "$VMID" --storage "nonexistent_storage_12345" --mode snapshot 2>&1 | tail -5 | tee -a "$LOG_FILE" || true - - warn "vzdump failed as expected (this is intentional)." - - wait_webhook 12 - check_new_events "$before" - - else - warn "No VM/CT available for backup tests." - info "You can create a minimal LXC container for testing:" - info " pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1" - fi - - # ── Test B3: Snapshot create/delete ── - if [ -n "$VMID" ] && [ "$VMTYPE" = "qemu" ]; then - log "" - log "${BOLD} Test B3: VM Snapshot create & delete${NC}" - info "Creating a snapshot of VM ${VMID} to test snapshot events." - - if confirm "Create snapshot 'proxmenux_test' on VM ${VMID}?"; then - local before - before=$(snapshot_history) - - if qm snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then - ok "Snapshot created!" - else - warn "Snapshot creation returned non-zero" - fi - - wait_webhook 10 - check_new_events "$before" - - # Clean up snapshot - info "Cleaning up test snapshot..." - qm delsnapshot "$VMID" proxmenux_test 2>/dev/null || true - ok "Snapshot removed." - fi - elif [ -n "$VMID" ] && [ "$VMTYPE" = "lxc" ]; then - log "" - log "${BOLD} Test B3: CT Snapshot create & delete${NC}" - info "Creating a snapshot of CT ${VMID}." - - if confirm "Create snapshot 'proxmenux_test' on CT ${VMID}?"; then - local before - before=$(snapshot_history) - - if pct snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then - ok "Snapshot created!" - else - warn "Snapshot creation returned non-zero" - fi - - wait_webhook 10 - check_new_events "$before" - - # Clean up - info "Cleaning up test snapshot..." - pct delsnapshot "$VMID" proxmenux_test 2>/dev/null || true - ok "Snapshot removed." - fi - fi - - # ── Test B4: PVE scheduled backup notification ── - log "" - log "${BOLD} Test B4: Trigger PVE notification system directly${NC}" - info "Using 'pvesh create /notifications/endpoints/...' to test PVE's own system." - info "This sends a test notification through PVE, which should hit our webhook." - - local before - before=$(snapshot_history) - - # PVE 8.x has a test endpoint for notifications - if pvesh create /notifications/targets/test --target proxmenux-webhook 2>&1 | tee -a "$LOG_FILE"; then - ok "PVE test notification sent!" - else - # Try alternative method - info "Direct test not available. Trying via API..." - pvesh set /notifications/endpoints/webhook/proxmenux-webhook --test 1 2>/dev/null || \ - warn "Could not send PVE test notification (requires PVE 8.1+)" - fi - - wait_webhook 8 - check_new_events "$before" -} - -# ============================================================================ -# TEST CATEGORY: VM/CT LIFECYCLE -# ============================================================================ -test_vmct() { - header "VM/CT LIFECYCLE TESTS" - - if [ -z "$VMID" ]; then - warn "No stopped VM/CT found for lifecycle tests." - info "Create a minimal CT: pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1" - return - fi - - log "" - log "${BOLD} Test V1: Start ${VMTYPE} ${VMID} (${VMNAME})${NC}" - - if confirm "Start ${VMTYPE} ${VMID}? It will be stopped again after the test."; then - local before - before=$(snapshot_history) - - if [ "$VMTYPE" = "lxc" ]; then - pct start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true - else - qm start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true - fi - - ok "Start command sent." - wait_webhook 10 - check_new_events "$before" - - # Wait a moment - sleep 5 - - # ── Test V2: Stop ── - log "" - log "${BOLD} Test V2: Stop ${VMTYPE} ${VMID}${NC}" - - before=$(snapshot_history) - - if [ "$VMTYPE" = "lxc" ]; then - pct stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true - else - qm stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true - fi - - ok "Stop command sent." - wait_webhook 10 - check_new_events "$before" - fi -} - -# ============================================================================ -# TEST CATEGORY: SYSTEM EVENTS (via syslog injection) -# ============================================================================ -test_system() { - header "SYSTEM EVENT TESTS (syslog injection)" - - # ── Test S1: Authentication failures ── - log "" - log "${BOLD} Test S1: SSH auth failure injection${NC}" - info "Injecting SSH auth failure messages into syslog." - - local before - before=$(snapshot_history) - - logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44312 ssh2" - sleep 2 - logger -t sshd -p auth.warning "Failed password for invalid user admin from 10.0.0.50 port 55123 ssh2" - sleep 2 - logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44315 ssh2" - - wait_webhook 8 - check_new_events "$before" - - # ── Test S2: Firewall event ── - log "" - log "${BOLD} Test S2: Firewall drop event${NC}" - - before=$(snapshot_history) - - logger -t kernel -p kern.warning "pve-fw-reject: IN=vmbr0 OUT= MAC=00:11:22:33:44:55 SRC=10.0.0.99 DST=192.168.1.1 PROTO=TCP DPT=22 REJECT" - sleep 2 - logger -t pvefw -p daemon.warning "firewall: blocked incoming connection from 10.0.0.99:45678 to 192.168.1.1:8006" - - wait_webhook 8 - check_new_events "$before" - - # ── Test S3: Service failure ── - log "" - log "${BOLD} Test S3: Service failure injection${NC}" - - before=$(snapshot_history) - - logger -t systemd -p daemon.err "pvedaemon.service: Main process exited, code=exited, status=1/FAILURE" - sleep 1 - logger -t systemd -p daemon.err "Failed to start Proxmox VE API Daemon." - - wait_webhook 8 - check_new_events "$before" -} - -# ============================================================================ -# SUMMARY & REPORT -# ============================================================================ -show_summary() { - header "TEST SUMMARY" - - info "Fetching full notification history..." - echo "" - - curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c " -import sys, json -from collections import Counter - -data = json.load(sys.stdin) -history = data.get('history', []) - -if not history: - print(' No notifications in history.') - sys.exit(0) - -# Group by event_type -by_type = Counter(h['event_type'] for h in history) -# Group by severity -by_sev = Counter(h.get('severity', '?') for h in history) -# Group by source -by_src = Counter(h.get('source', '?') for h in history) - -print(f' Total notifications: {len(history)}') -print() - -sev_icons = {'CRITICAL': '\033[0;31mCRITICAL\033[0m', 'WARNING': '\033[1;33mWARNING\033[0m', 'INFO': '\033[0;36mINFO\033[0m'} -print(' By severity:') -for sev, count in by_sev.most_common(): - icon = sev_icons.get(sev, sev) - print(f' {icon}: {count}') - -print() -print(' By source:') -for src, count in by_src.most_common(): - print(f' {src:20s}: {count}') - -print() -print(' By event type:') -for etype, count in by_type.most_common(): - print(f' {etype:30s}: {count}') - -print() -print(' Latest 15 events:') -for h in history[:15]: - sev = h.get('severity', '?') - icon = {'CRITICAL': ' \033[0;31mRED\033[0m', 'WARNING': ' \033[1;33mYEL\033[0m', 'INFO': ' \033[0;36mBLU\033[0m'}.get(sev, ' ???') - ts = h.get('sent_at', '?')[:19] - src = h.get('source', '?')[:12] - print(f' {icon} {ts} {src:12s} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:50]}') -" 2>/dev/null | tee -a "$LOG_FILE" - - echo "" - info "Full log saved to: ${LOG_FILE}" - echo "" - info "To see all history:" - echo -e " ${CYAN}curl -s '${API}/api/notifications/history?limit=200' | python3 -m json.tool${NC}" - echo "" - info "To check Telegram delivery, look at your Telegram bot chat." -} - -# ============================================================================ -# INTERACTIVE MENU -# ============================================================================ -show_menu() { - echo "" - echo -e "${BOLD} ProxMenux Real Event Test Suite${NC}" - echo "" - echo -e " ${CYAN}1)${NC} Disk error tests (SMART, ZFS, I/O, space pressure)" - echo -e " ${CYAN}2)${NC} Backup tests (vzdump success/fail, snapshots)" - echo -e " ${CYAN}3)${NC} VM/CT lifecycle tests (start/stop real VMs)" - echo -e " ${CYAN}4)${NC} System event tests (auth, firewall, service failures)" - echo -e " ${CYAN}5)${NC} Run ALL tests" - echo -e " ${CYAN}6)${NC} Show summary report" - echo -e " ${CYAN}q)${NC} Exit" - echo "" - echo -ne " Select: " -} - -# ── Main ──────────────────────────────────────────────────────── -main() { - local mode="${1:-menu}" - - echo "" - echo -e "${BOLD}============================================================${NC}" - echo -e "${BOLD} ProxMenux - Real Proxmox Event Simulator${NC}" - echo -e "${BOLD}============================================================${NC}" - echo -e " Tests REAL events through the full PVE -> webhook pipeline." - echo -e " Log file: ${CYAN}${LOG_FILE}${NC}" - echo "" - - preflight - - case "$mode" in - disk) test_disk; show_summary ;; - backup) test_backup; show_summary ;; - vmct) test_vmct; show_summary ;; - system) test_system; show_summary ;; - all) - test_disk - test_backup - test_vmct - test_system - show_summary - ;; - menu|*) - while true; do - show_menu - read -r choice - case "$choice" in - 1) test_disk ;; - 2) test_backup ;; - 3) test_vmct ;; - 4) test_system ;; - 5) test_disk; test_backup; test_vmct; test_system; show_summary; break ;; - 6) show_summary ;; - q|Q) echo " Bye!"; break ;; - *) warn "Invalid option" ;; - esac - done - ;; - esac -} - -main "${1:-menu}" diff --git a/AppImage/scripts/shutdown-notify.sh b/scripts/shutdown-notify.sh similarity index 100% rename from AppImage/scripts/shutdown-notify.sh rename to scripts/shutdown-notify.sh