Update notification service

This commit is contained in:
MacRimi
2026-03-25 20:12:08 +01:00
parent d53c1dc402
commit 68872d0e06
8 changed files with 6 additions and 1352 deletions

View File

@@ -99,8 +99,6 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found"
cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found"
cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found"
chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true
# Copy AI providers module for notification enhancement
echo "📋 Copying AI providers module..."

View File

@@ -1325,7 +1325,7 @@ class HealthPersistence:
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
# ────────────────────────────────────────────────────────────────
# ───────────────────────────────────────────────────────────────
# Disk Observations API
# ────────────────────────────────────────────────────────────────

View File

@@ -37,7 +37,7 @@ class _SharedState:
Two separate grace periods:
- startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
- startup_health_grace: Time to suppress transient health errors (longer, 3 min)
- startup_health_grace: Time to suppress transient health errors (longer, 5 min)
"""
def __init__(self):
self._lock = threading.Lock()
@@ -45,7 +45,7 @@ class _SharedState:
self._shutdown_grace = 120 # suppress VM/CT stops for 2 minutes after shutdown detected
self._startup_time: float = time.time() # when module was loaded (service start)
self._startup_vm_grace = 120 # aggregate VM/CT starts for 2 minutes after startup
self._startup_health_grace = 180 # suppress health warnings for 3 minutes after startup
self._startup_health_grace = 300 # suppress health warnings for 5 minutes after startup
self._startup_vms: list = [] # [(vmid, vmname, 'vm'|'ct'), ...]
self._startup_aggregated = False # have we already sent the aggregated message?
@@ -67,10 +67,10 @@ class _SharedState:
return (time.time() - self._startup_time) < self._startup_vm_grace
def is_startup_health_grace(self) -> bool:
"""Check if we're within the startup health grace period (3 min).
"""Check if we're within the startup health grace period (5 min).
Used by PollingCollector to suppress transient health warnings
(QMP timeout, storage not ready, etc.) during system boot.
(QMP timeout, storage not ready, high latency, etc.) during system boot.
"""
with self._lock:
return (time.time() - self._startup_time) < self._startup_health_grace

View File

@@ -1064,7 +1064,7 @@ def get_default_enabled_events() -> Dict[str, bool]:
}
# ── Emoji Enrichment (per-channel opt-in) ──────────────────────
# ── Emoji Enrichment (per-channel opt-in) ──────────────────────
# Category-level header icons
CATEGORY_EMOJI = {

View File

@@ -1,481 +0,0 @@
#!/bin/bash
# ============================================================================
# ProxMenux Notification System - Complete Test Suite
# ============================================================================
#
# Usage:
# chmod +x test_all_notifications.sh
# ./test_all_notifications.sh # Run ALL tests (with 3s pause between)
# ./test_all_notifications.sh system # Run only System category
# ./test_all_notifications.sh vm_ct # Run only VM/CT category
# ./test_all_notifications.sh backup # Run only Backup category
# ./test_all_notifications.sh resources # Run only Resources category
# ./test_all_notifications.sh storage # Run only Storage category
# ./test_all_notifications.sh network # Run only Network category
# ./test_all_notifications.sh security # Run only Security category
# ./test_all_notifications.sh cluster # Run only Cluster category
# ./test_all_notifications.sh burst # Run only Burst aggregation tests
#
# Each test sends a simulated webhook to the local notification endpoint.
# Check your Telegram/Gotify/Discord/Email for the notifications.
# ============================================================================
# Target endpoint of the local ProxMenux notification webhook receiver.
API="http://127.0.0.1:8008/api/notifications/webhook"
PAUSE=3 # seconds between tests
# ANSI color codes for terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
BOLD='\033[1m'
# Global counters updated by send_test and reported in the final summary.
test_count=0
pass_count=0
fail_count=0
send_test() {
    # Post one simulated webhook payload and record pass/fail.
    #   $1 = human-readable test label
    #   $2 = JSON payload to POST
    # Increments the global test/pass/fail counters used in the summary.
    local name="$1"
    local payload="$2"
    test_count=$((test_count + 1))
    echo -e "${CYAN} [$test_count] ${BOLD}$name${NC}"
    # -w appends the HTTP status code on its own line after the body.
    response=$(curl -s -w "\n%{http_code}" -X POST "$API" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>&1)
    http_code=$(echo "$response" | tail -1)
    body=$(echo "$response" | head -n -1)
    case "$http_code" in
        200|202)
            echo -e " ${GREEN}HTTP $http_code${NC} - $body"
            pass_count=$((pass_count + 1))
            ;;
        *)
            echo -e " ${RED}HTTP $http_code${NC} - $body"
            fail_count=$((fail_count + 1))
            ;;
    esac
    sleep "$PAUSE"
}
# ============================================================================
# SYSTEM CATEGORY (group: system)
# ============================================================================
test_system() {
# System category: startup/shutdown, kernel, services, updates, health
# digests. Each send_test call posts a payload shaped like the real event.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} SYSTEM - Startup, shutdown, kernel${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. state_change (disabled by default -- test to verify it does NOT arrive)
send_test "state_change (should NOT arrive - disabled by default)" \
'{"type":"state_change","component":"health","severity":"warning","title":"overall changed to WARNING","body":"overall status changed from OK to WARNING."}'
# 2. new_error
send_test "new_error" \
'{"type":"new_error","component":"health","severity":"warning","title":"New WARNING - cpu","body":"CPU usage exceeds 90% for more than 5 minutes","category":"cpu"}'
# 3. error_resolved
send_test "error_resolved" \
'{"type":"error_resolved","component":"health","severity":"info","title":"Resolved - cpu","body":"CPU usage returned to normal.\nDuration: 15 minutes","category":"cpu","duration":"15 minutes"}'
# 4. error_escalated
send_test "error_escalated" \
'{"type":"error_escalated","component":"health","severity":"critical","title":"Escalated to CRITICAL - memory","body":"Memory usage exceeded 95% and swap is active","category":"memory"}'
# 5. system_shutdown
send_test "system_shutdown" \
'{"type":"system_shutdown","component":"system","severity":"warning","title":"System shutting down","body":"The system is shutting down.\nUser initiated shutdown."}'
# 6. system_reboot
send_test "system_reboot" \
'{"type":"system_reboot","component":"system","severity":"warning","title":"System rebooting","body":"The system is rebooting.\nKernel update applied."}'
# 7. system_problem
send_test "system_problem" \
'{"type":"system_problem","component":"system","severity":"critical","title":"System problem detected","body":"Kernel panic: Attempted to kill init! exitcode=0x00000009"}'
# 8. service_fail
send_test "service_fail" \
'{"type":"service_fail","component":"systemd","severity":"warning","title":"Service failed - pvedaemon","body":"Service pvedaemon has failed.\nUnit pvedaemon.service entered failed state.","service_name":"pvedaemon"}'
# 9. update_available (legacy, superseded by update_summary)
send_test "update_available" \
'{"type":"update_available","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 12\nSecurity: 3\nProxmox: 5\nKernel: 1\nImportant: pve-manager (8.3.5 -> 8.4.1)","total_count":"12","security_count":"3","pve_count":"5","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1)"}'
# 10. update_complete
send_test "update_complete" \
'{"type":"update_complete","component":"apt","severity":"info","title":"Update completed","body":"12 packages updated successfully."}'
# 11. unknown_persistent
send_test "unknown_persistent" \
'{"type":"unknown_persistent","component":"health","severity":"warning","title":"Check unavailable - temperature","body":"Health check for temperature has been unavailable for 3+ cycles.\nSensor not responding.","category":"temperature"}'
# 12. health_persistent
send_test "health_persistent" \
'{"type":"health_persistent","component":"health","severity":"warning","title":"3 active health issue(s)","body":"The following health issues remain active:\n- CPU at 92%\n- Memory at 88%\n- Disk /dev/sda at 94%\n\nThis digest is sent once every 24 hours while issues persist.","count":"3"}'
# 13. health_issue_new
send_test "health_issue_new" \
'{"type":"health_issue_new","component":"health","severity":"warning","title":"New health issue - disk","body":"New WARNING issue detected:\nDisk /dev/sda usage at 94%","category":"disk"}'
# 14. health_issue_resolved
send_test "health_issue_resolved" \
'{"type":"health_issue_resolved","component":"health","severity":"info","title":"Resolved - disk","body":"disk issue has been resolved.\nDisk usage dropped to 72%.\nDuration: 3 hours","category":"disk","duration":"3 hours"}'
# 15. update_summary
send_test "update_summary" \
'{"type":"update_summary","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 70\nSecurity updates: 9\nProxmox-related updates: 24\nKernel updates: 1\nImportant packages: pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)","total_count":"70","security_count":"9","pve_count":"24","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)"}'
# 16. pve_update
send_test "pve_update" \
'{"type":"pve_update","component":"apt","severity":"info","title":"Proxmox VE 8.4.1 available","body":"Proxmox VE 8.3.5 -> 8.4.1\npve-manager 8.3.5 -> 8.4.1","current_version":"8.3.5","new_version":"8.4.1","version":"8.4.1","details":"pve-manager 8.3.5 -> 8.4.1"}'
}
# ============================================================================
# VM / CT CATEGORY (group: vm_ct)
# ============================================================================
test_vm_ct() {
# VM/CT category: lifecycle events (start/stop/fail), migrations and
# replication for both QEMU VMs and LXC containers.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} VM / CT - Start, stop, crash, migration${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. vm_start
send_test "vm_start" \
'{"type":"vm_start","component":"qemu","severity":"info","title":"VM 100 started","body":"ubuntu-server (100) has been started.","vmid":"100","vmname":"ubuntu-server"}'
# 2. vm_stop
send_test "vm_stop" \
'{"type":"vm_stop","component":"qemu","severity":"info","title":"VM 100 stopped","body":"ubuntu-server (100) has been stopped.","vmid":"100","vmname":"ubuntu-server"}'
# 3. vm_shutdown
send_test "vm_shutdown" \
'{"type":"vm_shutdown","component":"qemu","severity":"info","title":"VM 100 shutdown","body":"ubuntu-server (100) has been shut down.","vmid":"100","vmname":"ubuntu-server"}'
# 4. vm_fail
send_test "vm_fail" \
'{"type":"vm_fail","component":"qemu","severity":"critical","title":"VM 100 FAILED","body":"ubuntu-server (100) has failed.\nKVM: internal error: unexpected exit to hypervisor","vmid":"100","vmname":"ubuntu-server","reason":"KVM: internal error: unexpected exit to hypervisor"}'
# 5. vm_restart
send_test "vm_restart" \
'{"type":"vm_restart","component":"qemu","severity":"info","title":"VM 100 restarted","body":"ubuntu-server (100) has been restarted.","vmid":"100","vmname":"ubuntu-server"}'
# 6. ct_start
send_test "ct_start" \
'{"type":"ct_start","component":"lxc","severity":"info","title":"CT 200 started","body":"nginx-proxy (200) has been started.","vmid":"200","vmname":"nginx-proxy"}'
# 7. ct_stop
send_test "ct_stop" \
'{"type":"ct_stop","component":"lxc","severity":"info","title":"CT 200 stopped","body":"nginx-proxy (200) has been stopped.","vmid":"200","vmname":"nginx-proxy"}'
# 8. ct_fail
send_test "ct_fail" \
'{"type":"ct_fail","component":"lxc","severity":"critical","title":"CT 200 FAILED","body":"nginx-proxy (200) has failed.\nContainer exited with error code 137","vmid":"200","vmname":"nginx-proxy","reason":"Container exited with error code 137"}'
# 9. migration_start
send_test "migration_start" \
'{"type":"migration_start","component":"qemu","severity":"info","title":"Migration started - 100","body":"ubuntu-server (100) migration to pve-node2 started.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}'
# 10. migration_complete
send_test "migration_complete" \
'{"type":"migration_complete","component":"qemu","severity":"info","title":"Migration complete - 100","body":"ubuntu-server (100) migrated successfully to pve-node2.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}'
# 11. migration_fail
send_test "migration_fail" \
'{"type":"migration_fail","component":"qemu","severity":"critical","title":"Migration FAILED - 100","body":"ubuntu-server (100) migration to pve-node2 failed.\nNetwork timeout during memory transfer","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2","reason":"Network timeout during memory transfer"}'
# 12. replication_fail
send_test "replication_fail" \
'{"type":"replication_fail","component":"replication","severity":"critical","title":"Replication FAILED - 100","body":"Replication of ubuntu-server (100) has failed.\nTarget storage unreachable","vmid":"100","vmname":"ubuntu-server","reason":"Target storage unreachable"}'
# 13. replication_complete
send_test "replication_complete" \
'{"type":"replication_complete","component":"replication","severity":"info","title":"Replication complete - 100","body":"Replication of ubuntu-server (100) completed successfully.","vmid":"100","vmname":"ubuntu-server"}'
}
# ============================================================================
# BACKUP CATEGORY (group: backup)
# ============================================================================
test_backup() {
# Backup category: vzdump backup lifecycle and snapshot events.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} BACKUPS - Backup start, complete, fail${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. backup_start
send_test "backup_start" \
'{"type":"backup_start","component":"vzdump","severity":"info","title":"Backup started - 100","body":"Backup of ubuntu-server (100) has started.","vmid":"100","vmname":"ubuntu-server"}'
# 2. backup_complete
send_test "backup_complete" \
'{"type":"backup_complete","component":"vzdump","severity":"info","title":"Backup complete - 100","body":"Backup of ubuntu-server (100) completed successfully.\nSize: 12.4 GB","vmid":"100","vmname":"ubuntu-server","size":"12.4 GB"}'
# 3. backup_fail
send_test "backup_fail" \
'{"type":"backup_fail","component":"vzdump","severity":"critical","title":"Backup FAILED - 100","body":"Backup of ubuntu-server (100) has failed.\nStorage local-lvm is full","vmid":"100","vmname":"ubuntu-server","reason":"Storage local-lvm is full"}'
# 4. snapshot_complete
send_test "snapshot_complete" \
'{"type":"snapshot_complete","component":"qemu","severity":"info","title":"Snapshot created - 100","body":"Snapshot of ubuntu-server (100) created: pre-upgrade-2026","vmid":"100","vmname":"ubuntu-server","snapshot_name":"pre-upgrade-2026"}'
# 5. snapshot_fail
send_test "snapshot_fail" \
'{"type":"snapshot_fail","component":"qemu","severity":"critical","title":"Snapshot FAILED - 100","body":"Snapshot of ubuntu-server (100) failed.\nInsufficient space on storage","vmid":"100","vmname":"ubuntu-server","reason":"Insufficient space on storage"}'
}
# ============================================================================
# RESOURCES CATEGORY (group: resources)
# ============================================================================
test_resources() {
# Resources category: CPU, RAM, temperature and load threshold alerts.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} RESOURCES - CPU, memory, temperature${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. cpu_high
send_test "cpu_high" \
'{"type":"cpu_high","component":"health","severity":"warning","title":"High CPU usage (94%)","body":"CPU usage is at 94% on 16 cores.\nTop process: kvm (VM 100)","value":"94","cores":"16","details":"Top process: kvm (VM 100)"}'
# 2. ram_high
send_test "ram_high" \
'{"type":"ram_high","component":"health","severity":"warning","title":"High memory usage (91%)","body":"Memory usage: 58.2 GB / 64 GB (91%).\n4 VMs running, swap at 2.1 GB","value":"91","used":"58.2 GB","total":"64 GB","details":"4 VMs running, swap at 2.1 GB"}'
# 3. temp_high
send_test "temp_high" \
'{"type":"temp_high","component":"health","severity":"critical","title":"High temperature (89C)","body":"CPU temperature: 89C (threshold: 80C).\nCheck cooling system immediately","value":"89","threshold":"80","details":"Check cooling system immediately"}'
# 4. load_high
send_test "load_high" \
'{"type":"load_high","component":"health","severity":"warning","title":"High system load (24.5)","body":"System load average: 24.5 on 16 cores.\nI/O wait: 35%","value":"24.5","cores":"16","details":"I/O wait: 35%"}'
}
# ============================================================================
# STORAGE CATEGORY (group: storage)
# ============================================================================
test_storage() {
# Storage category: low disk space, I/O errors and burst-aggregated
# I/O error summaries.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} STORAGE - Disk space, I/O errors, SMART${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. disk_space_low
send_test "disk_space_low" \
'{"type":"disk_space_low","component":"storage","severity":"warning","title":"Low disk space on /var","body":"/var: 93% used (4.2 GB available).","mount":"/var","used":"93","available":"4.2 GB"}'
# 2. disk_io_error
send_test "disk_io_error" \
'{"type":"disk_io_error","component":"smart","severity":"critical","title":"Disk I/O error","body":"I/O error detected on /dev/sdb.\nSMART error: Current Pending Sector Count = 8","device":"/dev/sdb","reason":"SMART error: Current Pending Sector Count = 8"}'
# 3. burst_disk_io
send_test "burst_disk_io" \
'{"type":"burst_disk_io","component":"storage","severity":"critical","title":"5 disk I/O errors on /dev/sdb, /dev/sdc","body":"5 I/O errors detected in 60s.\nDevices: /dev/sdb, /dev/sdc","count":"5","window":"60s","entity_list":"/dev/sdb, /dev/sdc"}'
}
# ============================================================================
# NETWORK CATEGORY (group: network)
# ============================================================================
test_network() {
    # Network category: connectivity loss and high-latency alerts.
    local banner="${YELLOW}========================================${NC}"
    echo ""
    echo -e "$banner"
    echo -e "${YELLOW} NETWORK - Connectivity, bond, latency${NC}"
    echo -e "$banner"
    echo ""
    # Payloads mirror real pipeline events for this category.
    local down_payload='{"type":"network_down","component":"network","severity":"critical","title":"Network connectivity lost","body":"Network connectivity check failed.\nGateway 192.168.1.1 unreachable. Bond vmbr0 degraded.","reason":"Gateway 192.168.1.1 unreachable. Bond vmbr0 degraded."}'
    local latency_payload='{"type":"network_latency","component":"network","severity":"warning","title":"High network latency (450ms)","body":"Latency to gateway: 450ms (threshold: 100ms).","value":"450","threshold":"100"}'
    # 1. network_down
    send_test "network_down" "$down_payload"
    # 2. network_latency
    send_test "network_latency" "$latency_payload"
}
# ============================================================================
# SECURITY CATEGORY (group: security)
# ============================================================================
test_security() {
# Security category: auth failures, Fail2Ban bans, firewall issues,
# permission changes and their burst-aggregated variants.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} SECURITY - Auth failures, fail2ban, firewall${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. auth_fail
send_test "auth_fail" \
'{"type":"auth_fail","component":"auth","severity":"warning","title":"Authentication failure","body":"Failed login attempt from 203.0.113.42.\nUser: root\nService: sshd","source_ip":"203.0.113.42","username":"root","service":"sshd"}'
# 2. ip_block
send_test "ip_block" \
'{"type":"ip_block","component":"security","severity":"info","title":"IP blocked by Fail2Ban","body":"IP 203.0.113.42 has been banned.\nJail: sshd\nFailures: 5","source_ip":"203.0.113.42","jail":"sshd","failures":"5"}'
# 3. firewall_issue
send_test "firewall_issue" \
'{"type":"firewall_issue","component":"firewall","severity":"warning","title":"Firewall issue detected","body":"Firewall rule conflict detected on vmbr0.\nRule 15 overlaps with rule 23, potentially blocking cluster traffic.","reason":"Firewall rule conflict detected on vmbr0. Rule 15 overlaps with rule 23."}'
# 4. user_permission_change
send_test "user_permission_change" \
'{"type":"user_permission_change","component":"auth","severity":"info","title":"User permission changed","body":"User: admin@pam\nChange: Added PVEAdmin role on /vms/100","username":"admin@pam","change_details":"Added PVEAdmin role on /vms/100"}'
# 5. burst_auth_fail
send_test "burst_auth_fail" \
'{"type":"burst_auth_fail","component":"security","severity":"warning","title":"8 auth failures in 2m","body":"8 authentication failures detected in 2m.\nSources: 203.0.113.42, 198.51.100.7, 192.0.2.15","count":"8","window":"2m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15"}'
# 6. burst_ip_block
send_test "burst_ip_block" \
'{"type":"burst_ip_block","component":"security","severity":"info","title":"Fail2Ban banned 4 IPs in 5m","body":"4 IPs banned by Fail2Ban in 5m.\nIPs: 203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99","count":"4","window":"5m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99"}'
}
# ============================================================================
# CLUSTER CATEGORY (group: cluster)
# ============================================================================
test_cluster() {
# Cluster category: quorum loss, node connectivity and flapping.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} CLUSTER - Quorum, split-brain, HA fencing${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. split_brain
send_test "split_brain" \
'{"type":"split_brain","component":"cluster","severity":"critical","title":"SPLIT-BRAIN detected","body":"Cluster split-brain condition detected.\nQuorum status: No quorum - 1/3 nodes visible","quorum":"No quorum - 1/3 nodes visible"}'
# 2. node_disconnect
send_test "node_disconnect" \
'{"type":"node_disconnect","component":"corosync","severity":"critical","title":"Node disconnected","body":"Node pve-node3 has disconnected from the cluster.","node_name":"pve-node3"}'
# 3. node_reconnect
send_test "node_reconnect" \
'{"type":"node_reconnect","component":"corosync","severity":"info","title":"Node reconnected","body":"Node pve-node3 has reconnected to the cluster.","node_name":"pve-node3"}'
# 4. burst_cluster
send_test "burst_cluster" \
'{"type":"burst_cluster","component":"cluster","severity":"critical","title":"Cluster flapping detected (6 changes)","body":"Cluster state changed 6 times in 5m.\nNodes: pve-node2, pve-node3","count":"6","window":"5m","entity_list":"pve-node2, pve-node3"}'
}
# ============================================================================
# BURST AGGREGATION TESTS (send rapid events to trigger burst detection)
# ============================================================================
test_burst() {
# Burst tests: send rapid identical-type events directly (bypassing
# send_test so there is no pause) to trigger server-side aggregation,
# then wait for the aggregated notification to be emitted.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} BURST - Rapid events to trigger aggregation${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
echo -e "${BLUE} Sending 5 rapid auth_fail events (should trigger burst_auth_fail)...${NC}"
for i in $(seq 1 5); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"auth_fail\",\"component\":\"auth\",\"severity\":\"warning\",\"title\":\"Auth fail from 10.0.0.$i\",\"body\":\"Failed login from 10.0.0.$i\",\"source_ip\":\"10.0.0.$i\"}" > /dev/null
echo -e " ${CYAN}Sent auth_fail $i/5${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
echo ""
echo -e "${BLUE} Sending 4 rapid disk_io_error events (should trigger burst_disk_io)...${NC}"
for i in $(seq 1 4); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"disk_io_error\",\"component\":\"smart\",\"severity\":\"critical\",\"title\":\"I/O error on /dev/sd${i}\",\"body\":\"Error on device\",\"device\":\"/dev/sd${i}\"}" > /dev/null
echo -e " ${CYAN}Sent disk_io_error $i/4${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
echo ""
echo -e "${BLUE} Sending 3 rapid node_disconnect events (should trigger burst_cluster)...${NC}"
for i in $(seq 1 3); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"node_disconnect\",\"component\":\"corosync\",\"severity\":\"critical\",\"title\":\"Node pve-node$i disconnected\",\"body\":\"Node lost\",\"node_name\":\"pve-node$i\"}" > /dev/null
echo -e " ${CYAN}Sent node_disconnect $i/3${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
}
# ============================================================================
# MAIN
# ============================================================================
echo ""
echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} ProxMenux Notification System - Complete Test Suite${NC}"
echo -e "${BOLD}============================================================${NC}"
echo -e " API: $API"
echo -e " Pause: ${PAUSE}s between tests"
echo ""
# Check that the service is reachable
# Probe the status endpoint first so failures are reported up front
# instead of as 44 individual rejected tests.
status=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:8008/api/notifications/status" 2>/dev/null)
if [ "$status" != "200" ]; then
echo -e "${RED}ERROR: Notification service not reachable (HTTP $status)${NC}"
echo -e " Make sure ProxMenux Monitor is running."
exit 1
fi
echo -e "${GREEN}Service is reachable.${NC}"
# Parse argument
# Dispatch on the optional category argument; default runs everything.
category="${1:-all}"
case "$category" in
system) test_system ;;
vm_ct) test_vm_ct ;;
backup) test_backup ;;
resources) test_resources ;;
storage) test_storage ;;
network) test_network ;;
security) test_security ;;
cluster) test_cluster ;;
burst) test_burst ;;
all)
test_system
test_vm_ct
test_backup
test_resources
test_storage
test_network
test_security
test_cluster
test_burst
;;
*)
echo -e "${RED}Unknown category: $category${NC}"
echo "Usage: $0 [system|vm_ct|backup|resources|storage|network|security|cluster|burst|all]"
exit 1
;;
esac
# ============================================================================
# SUMMARY
# ============================================================================
# Report the pass/fail counters accumulated by send_test, plus helper
# one-liners for inspecting the notification history.
echo ""
echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} SUMMARY${NC}"
echo -e "${BOLD}============================================================${NC}"
echo -e " Total tests: $test_count"
echo -e " ${GREEN}Accepted:${NC} $pass_count"
echo -e " ${RED}Rejected:${NC} $fail_count"
echo ""
echo -e " Check your notification channels for the messages."
echo -e " Note: Some events may be filtered by your current settings"
echo -e " (severity filter, disabled categories, disabled individual events)."
echo ""
echo -e " To check notification history (all events):"
echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -m json.tool${NC}"
echo ""
echo -e " To count events by type:"
# FIX: this echo previously lacked its closing double quote, so bash swallowed
# the rest of the script into the string and failed with an EOF syntax error.
echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -c \"import sys,json; h=json.load(sys.stdin)['history']; [print(f' {t}: {c}') for t,c in sorted(dict((e['event_type'],sum(1 for x in h if x['event_type']==e['event_type'])) for e in h).items())]\"${NC}"
echo ""

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""
Test script to simulate a disk error and verify observation recording.
Usage: python3 test_disk_observation.py [device_name] [error_type]
Examples:
python3 test_disk_observation.py sdh io_error
python3 test_disk_observation.py sdh smart_error
python3 test_disk_observation.py sdh fs_error
"""
import sys
import os

# Add possible module locations to path so health_persistence can be found
# whether the script runs from the repo, an installed tree, or an AppImage.
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, script_dir)
sys.path.insert(0, '/usr/local/share/proxmenux')
sys.path.insert(0, '/tmp/.mount_ProxMeztyU13/usr/bin') # AppImage mount point

# Try to find the module (diagnostic only: reports the first matching path,
# the actual import below still resolves via normal sys.path order).
for path in sys.path:
    if os.path.exists(os.path.join(path, 'health_persistence.py')):
        print(f"[INFO] Found health_persistence.py in: {path}")
        break

from health_persistence import HealthPersistence
from datetime import datetime
def main():
    """Simulate a disk error and verify observation recording.

    Reads an optional device name (default ``sdh``) and error type
    (default ``io_error``) from ``sys.argv``, records a synthetic disk
    observation through ``HealthPersistence``, then reads it back and
    prints both the stored observations and the disk-registry entry.
    """
    device_name = sys.argv[1] if len(sys.argv) > 1 else 'sdh'
    error_type = sys.argv[2] if len(sys.argv) > 2 else 'io_error'
    # Known serial for sdh (WDC 2TB)
    # NOTE(review): serials below are hard-coded for the author's test bench;
    # devices not listed fall back to serial=None.
    serial_map = {
        'sdh': 'WD-WX72A30AA72R',
        'nvme0n1': '2241E675EA6C',
        'nvme1n1': '2241E675EBE6',
        'sda': '22440F443504',
        'sdb': 'WWZ1SJ18',
        'sdc': '52X0A0D9FZ1G',
        'sdd': '50026B7784446E63',
        'sde': '22440F442105',
        'sdf': 'WRQ0X2GP',
        'sdg': '23Q0A0MPFZ1G',
    }
    serial = serial_map.get(device_name, None)
    # Error messages by type — one realistic raw log line per supported type.
    error_messages = {
        'io_error': f'Test I/O error on /dev/{device_name}: sector read failed at LBA 12345678',
        'smart_error': f'/dev/{device_name}: SMART warning - 1 Currently unreadable (pending) sectors detected',
        'fs_error': f'EXT4-fs error (device {device_name}1): inode 123456: block 789012: error reading data',
    }
    # Stable signatures so repeated runs update the same observation row.
    error_signatures = {
        'io_error': f'io_test_{device_name}',
        'smart_error': f'smart_test_{device_name}',
        'fs_error': f'fs_test_{device_name}',
    }
    message = error_messages.get(error_type, f'Test error on /dev/{device_name}')
    signature = error_signatures.get(error_type, f'test_{device_name}')
    print(f"\n{'='*60}")
    print(f"Testing Disk Observation Recording")
    print(f"{'='*60}")
    print(f"Device: /dev/{device_name}")
    print(f"Serial: {serial or 'Unknown'}")
    print(f"Error Type: {error_type}")
    print(f"Message: {message}")
    print(f"Signature: {signature}")
    print(f"{'='*60}\n")
    # Initialize persistence
    hp = HealthPersistence()
    # Record the observation
    print("[1] Recording observation...")
    hp.record_disk_observation(
        device_name=device_name,
        serial=serial,
        error_type=error_type,
        error_signature=signature,
        raw_message=message,
        severity='warning'
    )
    print(" OK - Observation recorded\n")
    # Query observations for this device — verifies the write round-trips.
    print("[2] Querying observations for this device...")
    observations = hp.get_disk_observations(device_name=device_name, serial=serial)
    if observations:
        print(f" Found {len(observations)} observation(s):\n")
        for obs in observations:
            print(f" ID: {obs['id']}")
            print(f" Type: {obs['error_type']}")
            print(f" Signature: {obs['error_signature']}")
            print(f" Message: {obs['raw_message'][:80]}...")
            print(f" Severity: {obs['severity']}")
            print(f" First: {obs['first_occurrence']}")
            print(f" Last: {obs['last_occurrence']}")
            print(f" Count: {obs['occurrence_count']}")
            print(f" Dismissed: {obs['dismissed']}")
            print()
    else:
        print(" No observations found!\n")
    # Also show the disk registry entry that should have been created/updated.
    print("[3] Checking disk registry...")
    all_devices = hp.get_all_observed_devices()
    for dev in all_devices:
        if dev.get('device_name') == device_name or dev.get('serial') == serial:
            print(f" Found in registry:")
            print(f" ID: {dev.get('id')}")
            print(f" Device: {dev.get('device_name')}")
            print(f" Serial: {dev.get('serial')}")
            print(f" First seen: {dev.get('first_seen')}")
            print(f" Last seen: {dev.get('last_seen')}")
            print()
    print(f"{'='*60}")
    print("Test complete! Check the Storage section in the UI.")
    print(f"The disk /dev/{device_name} should now show an observations badge.")
    print(f"{'='*60}\n")
# Script entry point.
if __name__ == '__main__':
    main()

---- View File: test_real_events.sh (entire file removed in this commit: @@ -1,732 +0,0 @@) ----
#!/bin/bash
# ============================================================================
# ProxMenux - Real Proxmox Event Simulator
# ============================================================================
# This script triggers ACTUAL events on Proxmox so that PVE's notification
# system fires real webhooks through the full pipeline:
#
# PVE event -> PVE notification -> webhook POST -> our pipeline -> Telegram
#
# Unlike test_all_notifications.sh (which injects directly via API), this
# script makes Proxmox generate the events itself.
#
# Usage:
# chmod +x test_real_events.sh
# ./test_real_events.sh # interactive menu
# ./test_real_events.sh disk # run disk tests only
# ./test_real_events.sh backup # run backup tests only
# ./test_real_events.sh all # run all tests
# ============================================================================
# Abort on errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail
# ANSI color codes used by the log helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'
# Local ProxMenux Monitor API base URL and a unique per-run log file
API="http://127.0.0.1:8008"
LOG_FILE="/tmp/proxmenux_real_test_$(date +%Y%m%d_%H%M%S).log"
# ── Helpers ─────────────────────────────────────────────────────
log() { echo -e "$1" | tee -a "$LOG_FILE"; }
# Print a bold section banner (blank line, rule, title, rule) to stdout + log.
header() {
    local rule="${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
    log ""
    log "$rule"
    log "${BOLD} $1${NC}"
    log "$rule"
}
# Severity-tagged log helpers: colored status prefix, message, color reset.
warn() {
    log "${YELLOW} [!] $1${NC}"
}
ok() {
    log "${GREEN} [OK] $1${NC}"
}
fail() {
    log "${RED} [FAIL] $1${NC}"
}
info() {
    log "${CYAN} [i] $1${NC}"
}
# Prompt the user with a Y/n question; returns success (0) on empty input
# or anything starting with y/Y, failure (1) otherwise.
confirm() {
    echo ""
    echo -e "${YELLOW} $1${NC}"
    echo -ne " Continue? [Y/n]: "
    read -r ans
    case "$ans" in
        "" | [Yy]*) return 0 ;;
        *) return 1 ;;
    esac
}
# Pause to give PVE time to deliver the webhook; default wait is 10 seconds.
wait_webhook() {
    local seconds
    seconds="${1:-10}"
    log " Waiting ${seconds}s for webhook delivery..."
    sleep "${seconds}"
}
# Print the current number of entries in the notification history.
# Used to take before/after counts around each injected event; falls back
# to "0" on any curl or JSON parse failure so arithmetic never breaks.
snapshot_history() {
    curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    count = len(data.get('history', []))
    print(count)
except:
    print(0)
" 2>/dev/null || echo "0"
}
# Compare the current history count against the given baseline ($1).
# If new notifications arrived, print the latest ones; otherwise warn.
check_new_events() {
    local before=$1
    local after
    after=$(snapshot_history)
    local diff=$((after - before))
    if [ "$diff" -gt 0 ]; then
        ok "Received $diff new notification(s) via webhook"
        # Show the latest events (note: $diff is interpolated by the shell
        # into the embedded Python before it runs)
        curl -s "${API}/api/notifications/history?limit=$((diff + 2))" 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
for h in data.get('history', [])[:$diff]:
    sev = h.get('severity', '?')
    icon = {'CRITICAL': ' RED', 'WARNING': ' YEL', 'INFO': ' BLU'}.get(sev, ' ???')
    print(f'{icon} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:60]}')
" 2>/dev/null | tee -a "$LOG_FILE"
    else
        warn "No new notifications detected (may need more time or check filters)"
    fi
}
# ── Pre-flight checks ──────────────────────────────────────────
# Validate the environment before any tests run: root privileges, ProxMenux
# API reachability, PVE webhook endpoint, notification settings, and webhook
# re-setup. Also selects a stopped guest for later tests.
# Sets globals: VMID, VMTYPE, VMNAME (all empty when no stopped guest exists).
# Exits on hard failures; soft failures offer a confirm() escape hatch.
preflight() {
    header "Pre-flight Checks"
    # Check if running as root
    if [ "$(id -u)" -ne 0 ]; then
        fail "This script must be run as root"
        exit 1
    fi
    ok "Running as root"
    # Check ProxMenux is running
    if curl -s "${API}/api/health" >/dev/null 2>&1; then
        ok "ProxMenux Monitor is running"
    else
        fail "ProxMenux Monitor not reachable at ${API}"
        exit 1
    fi
    # Check webhook is configured by querying PVE directly
    if pvesh get /cluster/notifications/endpoints/webhook --output-format json 2>/dev/null | python3 -c "
import sys, json
endpoints = json.load(sys.stdin)
found = any('proxmenux' in e.get('name','').lower() for e in (endpoints if isinstance(endpoints, list) else [endpoints]))
exit(0 if found else 1)
" 2>/dev/null; then
        ok "PVE webhook endpoint 'proxmenux-webhook' is configured"
    else
        warn "PVE webhook may not be configured. Run setup from the UI first."
        if ! confirm "Continue anyway?"; then
            exit 1
        fi
    fi
    # Check notification config
    # API returns { config: { enabled: true/false/'true'/'false', ... }, success: true }
    if curl -s "${API}/api/notifications/settings" 2>/dev/null | python3 -c "
import sys, json
d = json.load(sys.stdin)
cfg = d.get('config', d)
enabled = cfg.get('enabled', False)
exit(0 if enabled is True or str(enabled).lower() == 'true' else 1)
" 2>/dev/null; then
        ok "Notifications are enabled"
    else
        fail "Notifications are NOT enabled. Enable them in the UI first."
        exit 1
    fi
    # Re-run webhook setup to ensure priv config and body template exist
    info "Re-configuring PVE webhook (ensures priv config + body template)..."
    local setup_result
    setup_result=$(curl -s -X POST "${API}/api/notifications/proxmox/setup-webhook" 2>/dev/null)
    if echo "$setup_result" | python3 -c "import sys,json; d=json.load(sys.stdin); exit(0 if d.get('configured') else 1)" 2>/dev/null; then
        ok "PVE webhook re-configured successfully"
    else
        local setup_err
        setup_err=$(echo "$setup_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','unknown'))" 2>/dev/null)
        warn "Webhook setup returned: ${setup_err}"
        warn "PVE webhook events may not work. Manual commands below:"
        # Print the fallback pvesh commands the API suggests, if any
        echo "$setup_result" | python3 -c "
import sys, json
d = json.load(sys.stdin)
for cmd in d.get('fallback_commands', []):
    print(f' {cmd}')
" 2>/dev/null
        if ! confirm "Continue anyway?"; then
            exit 1
        fi
    fi
    # Find a VM/CT for testing
    VMID=""
    VMNAME=""
    VMTYPE=""
    # Try to find a stopped CT first (safest)
    local cts
    cts=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null || echo "[]")
    # Look for a stopped container (sort prefers lxc over qemu, stopped first)
    VMID=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
# Prefer stopped CTs, then stopped VMs
for v in sorted(vms, key=lambda x: (0 if x.get('type')=='lxc' else 1, 0 if x.get('status')=='stopped' else 1)):
    if v.get('status') == 'stopped':
        print(v.get('vmid', ''))
        break
" 2>/dev/null || echo "")
    if [ -n "$VMID" ]; then
        # Resolve the guest's type and name from the same resource listing
        # (note: '$VMID' is shell-interpolated into the Python source)
        VMTYPE=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
for v in vms:
    if str(v.get('vmid')) == '$VMID':
        print(v.get('type', 'qemu'))
        break
" 2>/dev/null)
        VMNAME=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
for v in vms:
    if str(v.get('vmid')) == '$VMID':
        print(v.get('name', 'unknown'))
        break
" 2>/dev/null)
        ok "Found stopped ${VMTYPE} for testing: ${VMID} (${VMNAME})"
    else
        warn "No stopped VM/CT found. Backup tests will use ID 0 (host backup)."
    fi
    # List available storage
    info "Available storage:"
    pvesh get /storage --output-format json 2>/dev/null | python3 -c "
import sys, json
stores = json.load(sys.stdin)
for s in stores:
    sid = s.get('storage', '?')
    stype = s.get('type', '?')
    content = s.get('content', '?')
    print(f' {sid:20s} type={stype:10s} content={content}')
" 2>/dev/null | tee -a "$LOG_FILE" || warn "Could not list storage"
    echo ""
    log " Log file: ${LOG_FILE}"
}
# ============================================================================
# TEST CATEGORY: DISK ERRORS
# ============================================================================
# Disk-error tests: inject SMART/ZFS/I-O messages into syslog (picked up by
# the JournalWatcher) and optionally create real disk-space pressure on /.
# All syslog injections are non-destructive; the space-pressure test writes
# a temp file and removes it afterwards.
test_disk() {
    header "DISK ERROR TESTS"
    # ── Test D1: SMART error injection ──
    log ""
    log "${BOLD} Test D1: SMART error log injection${NC}"
    info "Writes a simulated SMART error to syslog so JournalWatcher catches it."
    info "This tests the journal -> notification_events -> pipeline flow."
    local before
    before=$(snapshot_history)
    # Inject a realistic SMART error into the system journal
    logger -t kernel -p kern.err "ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x0 action 0x6 frozen"
    sleep 1
    logger -t kernel -p kern.crit "ata1.00: failed command: READ FPDMA QUEUED"
    sleep 1
    logger -t smartd -p daemon.warning "Device: /dev/sda [SAT], 1 Currently unreadable (pending) sectors"
    wait_webhook 8
    check_new_events "$before"
    # ── Test D2: ZFS error simulation ──
    log ""
    log "${BOLD} Test D2: ZFS scrub error simulation${NC}"
    # Check if ZFS is available
    if command -v zpool >/dev/null 2>&1; then
        local zpools
        zpools=$(zpool list -H -o name 2>/dev/null || echo "")
        if [ -n "$zpools" ]; then
            local pool
            pool=$(echo "$zpools" | head -1)
            info "ZFS pool found: ${pool}"
            info "Injecting ZFS checksum error into syslog (non-destructive)."
            before=$(snapshot_history)
            # Simulate ZFS error events via syslog (non-destructive)
            logger -t kernel -p kern.warning "ZFS: pool '${pool}' has experienced an error"
            sleep 1
            logger -t zfs-module -p daemon.err "CHECKSUM error on ${pool}:mirror-0/sda: zio error"
            wait_webhook 8
            check_new_events "$before"
        else
            warn "ZFS installed but no pools found. Skipping ZFS test."
        fi
    else
        warn "ZFS not installed. Skipping ZFS test."
    fi
    # ── Test D3: Filesystem space pressure ──
    log ""
    log "${BOLD} Test D3: Disk space pressure simulation${NC}"
    info "Creates a large temporary file to fill disk, triggering space warnings."
    info "The Health Monitor should detect low disk space within ~60s."
    # Check current free space on / (awk coerces the 'NN%' field to a number)
    local free_pct
    free_pct=$(df / | tail -1 | awk '{print 100-$5}' | tr -d '%')
    info "Current free space on /: ${free_pct}%"
    if [ "$free_pct" -gt 15 ]; then
        info "Disk has ${free_pct}% free. Need to reduce below threshold for test."
        # Calculate how much to fill (leave only 8% free)
        local total_k free_k fill_k
        total_k=$(df / | tail -1 | awk '{print $2}')
        free_k=$(df / | tail -1 | awk '{print $4}')
        fill_k=$((free_k - (total_k * 8 / 100)))
        # Refuse to write more than ~50GB as a safety cap
        if [ "$fill_k" -gt 0 ] && [ "$fill_k" -lt 50000000 ]; then
            info "Will create ${fill_k}KB temp file to simulate low space."
            if confirm "This will temporarily fill disk to ~92% on /. Safe to proceed?"; then
                before=$(snapshot_history)
                # NOTE(review): if the script is interrupted during the 90s
                # wait, /tmp/.proxmenux_disk_test is left behind — consider a
                # trap-based cleanup. Confirm before relying on auto-cleanup.
                dd if=/dev/zero of=/tmp/.proxmenux_disk_test bs=1024 count="$fill_k" 2>/dev/null || true
                ok "Temp file created. Disk pressure active."
                info "Waiting 90s for Health Monitor to detect low space..."
                # Wait for health monitor polling cycle
                for i in $(seq 1 9); do
                    echo -ne "\r Waiting... ${i}0/90s"
                    sleep 10
                done
                echo ""
                # Clean up immediately
                rm -f /tmp/.proxmenux_disk_test
                ok "Temp file removed. Disk space restored."
                check_new_events "$before"
            else
                warn "Skipped disk pressure test."
            fi
        else
            warn "Cannot safely fill disk (would need ${fill_k}KB). Skipping."
        fi
    else
        warn "Disk already at ${free_pct}% free. Health Monitor may already be alerting."
    fi
    # ── Test D4: I/O error in syslog ──
    log ""
    log "${BOLD} Test D4: Generic I/O error injection${NC}"
    info "Injects I/O errors into syslog for JournalWatcher."
    before=$(snapshot_history)
    logger -t kernel -p kern.err "Buffer I/O error on dev sdb1, logical block 0, async page read"
    sleep 1
    logger -t kernel -p kern.err "EXT4-fs error (device sdb1): ext4_find_entry:1455: inode #2: comm ls: reading directory lblock 0"
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# TEST CATEGORY: BACKUP EVENTS
# ============================================================================
# Backup-event tests: a real vzdump success, a deliberate vzdump failure,
# snapshot create/delete, and a direct PVE notification-system test.
# Uses globals VMID/VMTYPE/VMNAME set by preflight(); guest-dependent tests
# are skipped when no stopped VM/CT was found.
#
# BUG FIX: the B1 cleanup previously ran `find -newer MARKER` before the
# marker file existed and never removed the found path, so every test run
# left its backup on disk. The marker is now created BEFORE vzdump and the
# fresh backup file is actually deleted afterwards.
test_backup() {
    header "BACKUP EVENT TESTS"
    local backup_storage=""
    # Find backup-capable storage: first one advertising 'backup'/'vztmpl'
    # content, falling back to 'local' (python for/else handles the fallback)
    backup_storage=$(pvesh get /storage --output-format json 2>/dev/null | python3 -c "
import sys, json
stores = json.load(sys.stdin)
for s in stores:
    content = s.get('content', '')
    if 'backup' in content or 'vztmpl' in content:
        print(s.get('storage', ''))
        break
# Fallback: try 'local'
else:
    for s in stores:
        if s.get('storage') == 'local':
            print('local')
            break
" 2>/dev/null || echo "local")
    info "Using backup storage: ${backup_storage}"
    # ── Test B1: Successful vzdump backup ──
    if [ -n "$VMID" ]; then
        log ""
        log "${BOLD} Test B1: Real vzdump backup (success)${NC}"
        info "Running a real vzdump backup of ${VMTYPE} ${VMID} (${VMNAME})."
        info "This triggers PVE's notification system with a real backup event."
        if confirm "This will backup ${VMTYPE} ${VMID} to '${backup_storage}'. Proceed?"; then
            local before
            before=$(snapshot_history)
            # Use snapshot mode for VMs (non-disruptive), suspend mode for CTs
            local bmode="snapshot"
            if [ "$VMTYPE" = "lxc" ]; then
                bmode="suspend"
            fi
            # Create the marker BEFORE vzdump so 'find -newer' below can
            # identify the backup file this test run produced.
            touch /tmp/.proxmenux_bak_marker 2>/dev/null || true
            info "Starting vzdump (mode=${bmode}, compress=zstd)..."
            if vzdump "$VMID" --storage "$backup_storage" --mode "$bmode" --compress zstd --notes-template "ProxMenux test backup" 2>&1 | tee -a "$LOG_FILE"; then
                ok "vzdump completed successfully!"
            else
                warn "vzdump returned non-zero (check output above)"
            fi
            wait_webhook 12
            check_new_events "$before"
            # Clean up the test backup created after the marker.
            # NOTE(review): assumes directory-backed storage under
            # /var/lib/vz/dump — backups on other storage types are kept.
            info "Cleaning up test backup file..."
            local latest_bak
            latest_bak=$(find "/var/lib/vz/dump/" -name "vzdump-*-${VMID}-*" -type f -newer /tmp/.proxmenux_bak_marker 2>/dev/null | head -1 || echo "")
            if [ -n "$latest_bak" ]; then
                rm -f "$latest_bak"
                ok "Removed test backup: ${latest_bak}"
            else
                warn "No fresh backup file found under /var/lib/vz/dump/ (storage may not be dir-based)"
            fi
            rm -f /tmp/.proxmenux_bak_marker 2>/dev/null || true
        else
            warn "Skipped backup success test."
        fi
        # ── Test B2: Failed vzdump backup ──
        log ""
        log "${BOLD} Test B2: vzdump backup failure (invalid storage)${NC}"
        info "Attempting backup to non-existent storage to trigger a backup failure event."
        before=$(snapshot_history)
        # This WILL fail because the storage doesn't exist
        info "Starting vzdump to fake storage (will fail intentionally)..."
        vzdump "$VMID" --storage "nonexistent_storage_12345" --mode snapshot 2>&1 | tail -5 | tee -a "$LOG_FILE" || true
        warn "vzdump failed as expected (this is intentional)."
        wait_webhook 12
        check_new_events "$before"
    else
        warn "No VM/CT available for backup tests."
        info "You can create a minimal LXC container for testing:"
        info " pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1"
    fi
    # ── Test B3: Snapshot create/delete ──
    if [ -n "$VMID" ] && [ "$VMTYPE" = "qemu" ]; then
        log ""
        log "${BOLD} Test B3: VM Snapshot create & delete${NC}"
        info "Creating a snapshot of VM ${VMID} to test snapshot events."
        if confirm "Create snapshot 'proxmenux_test' on VM ${VMID}?"; then
            local before
            before=$(snapshot_history)
            if qm snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then
                ok "Snapshot created!"
            else
                warn "Snapshot creation returned non-zero"
            fi
            wait_webhook 10
            check_new_events "$before"
            # Clean up snapshot
            info "Cleaning up test snapshot..."
            qm delsnapshot "$VMID" proxmenux_test 2>/dev/null || true
            ok "Snapshot removed."
        fi
    elif [ -n "$VMID" ] && [ "$VMTYPE" = "lxc" ]; then
        log ""
        log "${BOLD} Test B3: CT Snapshot create & delete${NC}"
        info "Creating a snapshot of CT ${VMID}."
        if confirm "Create snapshot 'proxmenux_test' on CT ${VMID}?"; then
            local before
            before=$(snapshot_history)
            if pct snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then
                ok "Snapshot created!"
            else
                warn "Snapshot creation returned non-zero"
            fi
            wait_webhook 10
            check_new_events "$before"
            # Clean up
            info "Cleaning up test snapshot..."
            pct delsnapshot "$VMID" proxmenux_test 2>/dev/null || true
            ok "Snapshot removed."
        fi
    fi
    # ── Test B4: PVE scheduled backup notification ──
    log ""
    log "${BOLD} Test B4: Trigger PVE notification system directly${NC}"
    info "Using 'pvesh create /notifications/endpoints/...' to test PVE's own system."
    info "This sends a test notification through PVE, which should hit our webhook."
    local before
    before=$(snapshot_history)
    # PVE 8.x has a test endpoint for notifications
    if pvesh create /notifications/targets/test --target proxmenux-webhook 2>&1 | tee -a "$LOG_FILE"; then
        ok "PVE test notification sent!"
    else
        # Try alternative method
        info "Direct test not available. Trying via API..."
        pvesh set /notifications/endpoints/webhook/proxmenux-webhook --test 1 2>/dev/null || \
            warn "Could not send PVE test notification (requires PVE 8.1+)"
    fi
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# TEST CATEGORY: VM/CT LIFECYCLE
# ============================================================================
# Start/stop lifecycle tests against the stopped guest chosen by preflight().
# The guest is started (V1), then stopped again (V2), restoring its
# pre-test state. Returns early when no stopped VM/CT is available.
test_vmct() {
    header "VM/CT LIFECYCLE TESTS"
    if [ -z "$VMID" ]; then
        warn "No stopped VM/CT found for lifecycle tests."
        info "Create a minimal CT: pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1"
        return
    fi
    log ""
    log "${BOLD} Test V1: Start ${VMTYPE} ${VMID} (${VMNAME})${NC}"
    if confirm "Start ${VMTYPE} ${VMID}? It will be stopped again after the test."; then
        local before
        before=$(snapshot_history)
        # pct for containers, qm for QEMU VMs
        if [ "$VMTYPE" = "lxc" ]; then
            pct start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        else
            qm start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        fi
        ok "Start command sent."
        wait_webhook 10
        check_new_events "$before"
        # Wait a moment
        sleep 5
        # ── Test V2: Stop ──
        log ""
        log "${BOLD} Test V2: Stop ${VMTYPE} ${VMID}${NC}"
        before=$(snapshot_history)
        if [ "$VMTYPE" = "lxc" ]; then
            pct stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        else
            qm stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        fi
        ok "Stop command sent."
        wait_webhook 10
        check_new_events "$before"
    fi
}
# ============================================================================
# TEST CATEGORY: SYSTEM EVENTS (via syslog injection)
# ============================================================================
# System-event tests: inject SSH auth failures, firewall drops, and service
# failures into syslog so the JournalWatcher can pick them up. No real
# services are touched — everything goes through logger(1).
test_system() {
    header "SYSTEM EVENT TESTS (syslog injection)"
    # ── Test S1: Authentication failures ──
    log ""
    log "${BOLD} Test S1: SSH auth failure injection${NC}"
    info "Injecting SSH auth failure messages into syslog."
    local before
    before=$(snapshot_history)
    # Three failures from two source IPs — resembles a brute-force pattern
    logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44312 ssh2"
    sleep 2
    logger -t sshd -p auth.warning "Failed password for invalid user admin from 10.0.0.50 port 55123 ssh2"
    sleep 2
    logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44315 ssh2"
    wait_webhook 8
    check_new_events "$before"
    # ── Test S2: Firewall event ──
    log ""
    log "${BOLD} Test S2: Firewall drop event${NC}"
    before=$(snapshot_history)
    logger -t kernel -p kern.warning "pve-fw-reject: IN=vmbr0 OUT= MAC=00:11:22:33:44:55 SRC=10.0.0.99 DST=192.168.1.1 PROTO=TCP DPT=22 REJECT"
    sleep 2
    logger -t pvefw -p daemon.warning "firewall: blocked incoming connection from 10.0.0.99:45678 to 192.168.1.1:8006"
    wait_webhook 8
    check_new_events "$before"
    # ── Test S3: Service failure ──
    log ""
    log "${BOLD} Test S3: Service failure injection${NC}"
    before=$(snapshot_history)
    logger -t systemd -p daemon.err "pvedaemon.service: Main process exited, code=exited, status=1/FAILURE"
    sleep 1
    logger -t systemd -p daemon.err "Failed to start Proxmox VE API Daemon."
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# SUMMARY & REPORT
# ============================================================================
# Print an aggregated report of the notification history: totals grouped by
# severity, source, and event type, plus the latest 15 events.
show_summary() {
    header "TEST SUMMARY"
    info "Fetching full notification history..."
    echo ""
    curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c "
import sys, json
from collections import Counter
data = json.load(sys.stdin)
history = data.get('history', [])
if not history:
    print(' No notifications in history.')
    sys.exit(0)
# Group by event_type
by_type = Counter(h['event_type'] for h in history)
# Group by severity
by_sev = Counter(h.get('severity', '?') for h in history)
# Group by source
by_src = Counter(h.get('source', '?') for h in history)
print(f' Total notifications: {len(history)}')
print()
sev_icons = {'CRITICAL': '\033[0;31mCRITICAL\033[0m', 'WARNING': '\033[1;33mWARNING\033[0m', 'INFO': '\033[0;36mINFO\033[0m'}
print(' By severity:')
for sev, count in by_sev.most_common():
    icon = sev_icons.get(sev, sev)
    print(f' {icon}: {count}')
print()
print(' By source:')
for src, count in by_src.most_common():
    print(f' {src:20s}: {count}')
print()
print(' By event type:')
for etype, count in by_type.most_common():
    print(f' {etype:30s}: {count}')
print()
print(' Latest 15 events:')
for h in history[:15]:
    sev = h.get('severity', '?')
    icon = {'CRITICAL': ' \033[0;31mRED\033[0m', 'WARNING': ' \033[1;33mYEL\033[0m', 'INFO': ' \033[0;36mBLU\033[0m'}.get(sev, ' ???')
    ts = h.get('sent_at', '?')[:19]
    src = h.get('source', '?')[:12]
    print(f' {icon} {ts} {src:12s} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:50]}')
" 2>/dev/null | tee -a "$LOG_FILE"
    echo ""
    info "Full log saved to: ${LOG_FILE}"
    echo ""
    info "To see all history:"
    echo -e " ${CYAN}curl -s '${API}/api/notifications/history?limit=200' | python3 -m json.tool${NC}"
    echo ""
    info "To check Telegram delivery, look at your Telegram bot chat."
}
# ============================================================================
# INTERACTIVE MENU
# ============================================================================
# Print the interactive menu and leave the cursor after the "Select:" prompt.
show_menu() {
    printf '\n'
    printf '%b\n' "${BOLD} ProxMenux Real Event Test Suite${NC}"
    printf '\n'
    local -a entries=(
        "${CYAN}1)${NC} Disk error tests (SMART, ZFS, I/O, space pressure)"
        "${CYAN}2)${NC} Backup tests (vzdump success/fail, snapshots)"
        "${CYAN}3)${NC} VM/CT lifecycle tests (start/stop real VMs)"
        "${CYAN}4)${NC} System event tests (auth, firewall, service failures)"
        "${CYAN}5)${NC} Run ALL tests"
        "${CYAN}6)${NC} Show summary report"
        "${CYAN}q)${NC} Exit"
    )
    local entry
    for entry in "${entries[@]}"; do
        printf '%b\n' " ${entry}"
    done
    printf '\n'
    printf '%b' " Select: "
}
# ── Main ────────────────────────────────────────────────────────
# Entry point: print the banner, run pre-flight checks, then dispatch on the
# requested mode (disk|backup|vmct|system|all) or drop into the interactive
# menu for any other/missing argument.
main() {
    local mode="${1:-menu}"
    echo ""
    echo -e "${BOLD}============================================================${NC}"
    echo -e "${BOLD} ProxMenux - Real Proxmox Event Simulator${NC}"
    echo -e "${BOLD}============================================================${NC}"
    echo -e " Tests REAL events through the full PVE -> webhook pipeline."
    echo -e " Log file: ${CYAN}${LOG_FILE}${NC}"
    echo ""
    preflight
    case "$mode" in
        disk) test_disk; show_summary ;;
        backup) test_backup; show_summary ;;
        vmct) test_vmct; show_summary ;;
        system) test_system; show_summary ;;
        all)
            test_disk
            test_backup
            test_vmct
            test_system
            show_summary
            ;;
        menu|*)
            # Interactive loop; option 5 runs everything, summarizes, and exits
            while true; do
                show_menu
                read -r choice
                case "$choice" in
                    1) test_disk ;;
                    2) test_backup ;;
                    3) test_vmct ;;
                    4) test_system ;;
                    5) test_disk; test_backup; test_vmct; test_system; show_summary; break ;;
                    6) show_summary ;;
                    q|Q) echo " Bye!"; break ;;
                    *) warn "Invalid option" ;;
                esac
            done
            ;;
    esac
}
main "${1:-menu}"