Update notification service

This commit is contained in:
MacRimi
2026-03-25 20:12:08 +01:00
parent d53c1dc402
commit 68872d0e06
8 changed files with 6 additions and 1352 deletions

View File

@@ -99,8 +99,6 @@ cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null ||
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
cp "$SCRIPT_DIR/flask_oci_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_oci_routes.py not found"
cp "$SCRIPT_DIR/oci/description_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ description_templates.py not found"
cp "$SCRIPT_DIR/shutdown-notify.sh" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ shutdown-notify.sh not found"
chmod +x "$APP_DIR/usr/bin/shutdown-notify.sh" 2>/dev/null || true
# Copy AI providers module for notification enhancement
echo "📋 Copying AI providers module..."

View File

@@ -1325,7 +1325,7 @@ class HealthPersistence:
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
# ────────────────────────────────────────────────────────────────
# ───────────────────────────────────────────────────────────────
# Disk Observations API
# ────────────────────────────────────────────────────────────────

View File

@@ -37,7 +37,7 @@ class _SharedState:
Two separate grace periods:
- startup_vm_grace: Time to aggregate VM/CT starts (shorter, 2 min)
- startup_health_grace: Time to suppress transient health errors (longer, 3 min)
- startup_health_grace: Time to suppress transient health errors (longer, 5 min)
"""
def __init__(self):
self._lock = threading.Lock()
@@ -45,7 +45,7 @@ class _SharedState:
self._shutdown_grace = 120 # suppress VM/CT stops for 2 minutes after shutdown detected
self._startup_time: float = time.time() # when module was loaded (service start)
self._startup_vm_grace = 120 # aggregate VM/CT starts for 2 minutes after startup
self._startup_health_grace = 180 # suppress health warnings for 3 minutes after startup
self._startup_health_grace = 300 # suppress health warnings for 5 minutes after startup
self._startup_vms: list = [] # [(vmid, vmname, 'vm'|'ct'), ...]
self._startup_aggregated = False # have we already sent the aggregated message?
@@ -67,10 +67,10 @@ class _SharedState:
return (time.time() - self._startup_time) < self._startup_vm_grace
def is_startup_health_grace(self) -> bool:
"""Check if we're within the startup health grace period (3 min).
"""Check if we're within the startup health grace period (5 min).
Used by PollingCollector to suppress transient health warnings
(QMP timeout, storage not ready, etc.) during system boot.
(QMP timeout, storage not ready, high latency, etc.) during system boot.
"""
with self._lock:
return (time.time() - self._startup_time) < self._startup_health_grace

View File

@@ -1064,7 +1064,7 @@ def get_default_enabled_events() -> Dict[str, bool]:
}
# ── Emoji Enrichment (per-channel opt-in) ──────────────────────
# ── Emoji Enrichment (per-channel opt-in) ──────────────────────
# Category-level header icons
CATEGORY_EMOJI = {

View File

@@ -1,481 +0,0 @@
#!/bin/bash
# ============================================================================
# ProxMenux Notification System - Complete Test Suite
# ============================================================================
#
# Usage:
# chmod +x test_all_notifications.sh
# ./test_all_notifications.sh # Run ALL tests (with 3s pause between)
# ./test_all_notifications.sh system # Run only System category
# ./test_all_notifications.sh vm_ct # Run only VM/CT category
# ./test_all_notifications.sh backup # Run only Backup category
# ./test_all_notifications.sh resources # Run only Resources category
# ./test_all_notifications.sh storage # Run only Storage category
# ./test_all_notifications.sh network # Run only Network category
# ./test_all_notifications.sh security # Run only Security category
# ./test_all_notifications.sh cluster # Run only Cluster category
# ./test_all_notifications.sh burst # Run only Burst aggregation tests
#
# Each test sends a simulated webhook to the local notification endpoint.
# Check your Telegram/Gotify/Discord/Email for the notifications.
# ============================================================================
# Target endpoint of the local ProxMenux notification webhook receiver.
API="http://127.0.0.1:8008/api/notifications/webhook"
PAUSE=3 # seconds between tests
# ANSI color codes for terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
BOLD='\033[1m'
# Global counters updated by send_test and reported in the final summary.
test_count=0
pass_count=0
fail_count=0
send_test() {
    # Post one simulated webhook payload and record pass/fail.
    #   $1 = human-readable test label
    #   $2 = JSON payload to POST
    # Increments the global test/pass/fail counters used in the summary.
    local name="$1"
    local payload="$2"
    test_count=$((test_count + 1))
    echo -e "${CYAN} [$test_count] ${BOLD}$name${NC}"
    # -w appends the HTTP status code on its own line after the body.
    response=$(curl -s -w "\n%{http_code}" -X POST "$API" \
        -H "Content-Type: application/json" \
        -d "$payload" 2>&1)
    http_code=$(echo "$response" | tail -1)
    body=$(echo "$response" | head -n -1)
    case "$http_code" in
        200|202)
            echo -e " ${GREEN}HTTP $http_code${NC} - $body"
            pass_count=$((pass_count + 1))
            ;;
        *)
            echo -e " ${RED}HTTP $http_code${NC} - $body"
            fail_count=$((fail_count + 1))
            ;;
    esac
    sleep "$PAUSE"
}
# ============================================================================
# SYSTEM CATEGORY (group: system)
# ============================================================================
test_system() {
# System category: startup/shutdown, kernel, services, updates, health
# digests. Each send_test call posts a payload shaped like the real event.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} SYSTEM - Startup, shutdown, kernel${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. state_change (disabled by default -- test to verify it does NOT arrive)
send_test "state_change (should NOT arrive - disabled by default)" \
'{"type":"state_change","component":"health","severity":"warning","title":"overall changed to WARNING","body":"overall status changed from OK to WARNING."}'
# 2. new_error
send_test "new_error" \
'{"type":"new_error","component":"health","severity":"warning","title":"New WARNING - cpu","body":"CPU usage exceeds 90% for more than 5 minutes","category":"cpu"}'
# 3. error_resolved
send_test "error_resolved" \
'{"type":"error_resolved","component":"health","severity":"info","title":"Resolved - cpu","body":"CPU usage returned to normal.\nDuration: 15 minutes","category":"cpu","duration":"15 minutes"}'
# 4. error_escalated
send_test "error_escalated" \
'{"type":"error_escalated","component":"health","severity":"critical","title":"Escalated to CRITICAL - memory","body":"Memory usage exceeded 95% and swap is active","category":"memory"}'
# 5. system_shutdown
send_test "system_shutdown" \
'{"type":"system_shutdown","component":"system","severity":"warning","title":"System shutting down","body":"The system is shutting down.\nUser initiated shutdown."}'
# 6. system_reboot
send_test "system_reboot" \
'{"type":"system_reboot","component":"system","severity":"warning","title":"System rebooting","body":"The system is rebooting.\nKernel update applied."}'
# 7. system_problem
send_test "system_problem" \
'{"type":"system_problem","component":"system","severity":"critical","title":"System problem detected","body":"Kernel panic: Attempted to kill init! exitcode=0x00000009"}'
# 8. service_fail
send_test "service_fail" \
'{"type":"service_fail","component":"systemd","severity":"warning","title":"Service failed - pvedaemon","body":"Service pvedaemon has failed.\nUnit pvedaemon.service entered failed state.","service_name":"pvedaemon"}'
# 9. update_available (legacy, superseded by update_summary)
send_test "update_available" \
'{"type":"update_available","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 12\nSecurity: 3\nProxmox: 5\nKernel: 1\nImportant: pve-manager (8.3.5 -> 8.4.1)","total_count":"12","security_count":"3","pve_count":"5","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1)"}'
# 10. update_complete
send_test "update_complete" \
'{"type":"update_complete","component":"apt","severity":"info","title":"Update completed","body":"12 packages updated successfully."}'
# 11. unknown_persistent
send_test "unknown_persistent" \
'{"type":"unknown_persistent","component":"health","severity":"warning","title":"Check unavailable - temperature","body":"Health check for temperature has been unavailable for 3+ cycles.\nSensor not responding.","category":"temperature"}'
# 12. health_persistent
send_test "health_persistent" \
'{"type":"health_persistent","component":"health","severity":"warning","title":"3 active health issue(s)","body":"The following health issues remain active:\n- CPU at 92%\n- Memory at 88%\n- Disk /dev/sda at 94%\n\nThis digest is sent once every 24 hours while issues persist.","count":"3"}'
# 13. health_issue_new
send_test "health_issue_new" \
'{"type":"health_issue_new","component":"health","severity":"warning","title":"New health issue - disk","body":"New WARNING issue detected:\nDisk /dev/sda usage at 94%","category":"disk"}'
# 14. health_issue_resolved
send_test "health_issue_resolved" \
'{"type":"health_issue_resolved","component":"health","severity":"info","title":"Resolved - disk","body":"disk issue has been resolved.\nDisk usage dropped to 72%.\nDuration: 3 hours","category":"disk","duration":"3 hours"}'
# 15. update_summary
send_test "update_summary" \
'{"type":"update_summary","component":"apt","severity":"info","title":"Updates available","body":"Total updates: 70\nSecurity updates: 9\nProxmox-related updates: 24\nKernel updates: 1\nImportant packages: pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)","total_count":"70","security_count":"9","pve_count":"24","kernel_count":"1","important_list":"pve-manager (8.3.5 -> 8.4.1), proxmox-ve (8.3.0 -> 8.4.0), qemu-server (8.3.8 -> 8.4.2)"}'
# 16. pve_update
send_test "pve_update" \
'{"type":"pve_update","component":"apt","severity":"info","title":"Proxmox VE 8.4.1 available","body":"Proxmox VE 8.3.5 -> 8.4.1\npve-manager 8.3.5 -> 8.4.1","current_version":"8.3.5","new_version":"8.4.1","version":"8.4.1","details":"pve-manager 8.3.5 -> 8.4.1"}'
}
# ============================================================================
# VM / CT CATEGORY (group: vm_ct)
# ============================================================================
test_vm_ct() {
# VM/CT category: lifecycle events (start/stop/fail), migrations and
# replication for both QEMU VMs and LXC containers.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} VM / CT - Start, stop, crash, migration${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. vm_start
send_test "vm_start" \
'{"type":"vm_start","component":"qemu","severity":"info","title":"VM 100 started","body":"ubuntu-server (100) has been started.","vmid":"100","vmname":"ubuntu-server"}'
# 2. vm_stop
send_test "vm_stop" \
'{"type":"vm_stop","component":"qemu","severity":"info","title":"VM 100 stopped","body":"ubuntu-server (100) has been stopped.","vmid":"100","vmname":"ubuntu-server"}'
# 3. vm_shutdown
send_test "vm_shutdown" \
'{"type":"vm_shutdown","component":"qemu","severity":"info","title":"VM 100 shutdown","body":"ubuntu-server (100) has been shut down.","vmid":"100","vmname":"ubuntu-server"}'
# 4. vm_fail
send_test "vm_fail" \
'{"type":"vm_fail","component":"qemu","severity":"critical","title":"VM 100 FAILED","body":"ubuntu-server (100) has failed.\nKVM: internal error: unexpected exit to hypervisor","vmid":"100","vmname":"ubuntu-server","reason":"KVM: internal error: unexpected exit to hypervisor"}'
# 5. vm_restart
send_test "vm_restart" \
'{"type":"vm_restart","component":"qemu","severity":"info","title":"VM 100 restarted","body":"ubuntu-server (100) has been restarted.","vmid":"100","vmname":"ubuntu-server"}'
# 6. ct_start
send_test "ct_start" \
'{"type":"ct_start","component":"lxc","severity":"info","title":"CT 200 started","body":"nginx-proxy (200) has been started.","vmid":"200","vmname":"nginx-proxy"}'
# 7. ct_stop
send_test "ct_stop" \
'{"type":"ct_stop","component":"lxc","severity":"info","title":"CT 200 stopped","body":"nginx-proxy (200) has been stopped.","vmid":"200","vmname":"nginx-proxy"}'
# 8. ct_fail
send_test "ct_fail" \
'{"type":"ct_fail","component":"lxc","severity":"critical","title":"CT 200 FAILED","body":"nginx-proxy (200) has failed.\nContainer exited with error code 137","vmid":"200","vmname":"nginx-proxy","reason":"Container exited with error code 137"}'
# 9. migration_start
send_test "migration_start" \
'{"type":"migration_start","component":"qemu","severity":"info","title":"Migration started - 100","body":"ubuntu-server (100) migration to pve-node2 started.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}'
# 10. migration_complete
send_test "migration_complete" \
'{"type":"migration_complete","component":"qemu","severity":"info","title":"Migration complete - 100","body":"ubuntu-server (100) migrated successfully to pve-node2.","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2"}'
# 11. migration_fail
send_test "migration_fail" \
'{"type":"migration_fail","component":"qemu","severity":"critical","title":"Migration FAILED - 100","body":"ubuntu-server (100) migration to pve-node2 failed.\nNetwork timeout during memory transfer","vmid":"100","vmname":"ubuntu-server","target_node":"pve-node2","reason":"Network timeout during memory transfer"}'
# 12. replication_fail
send_test "replication_fail" \
'{"type":"replication_fail","component":"replication","severity":"critical","title":"Replication FAILED - 100","body":"Replication of ubuntu-server (100) has failed.\nTarget storage unreachable","vmid":"100","vmname":"ubuntu-server","reason":"Target storage unreachable"}'
# 13. replication_complete
send_test "replication_complete" \
'{"type":"replication_complete","component":"replication","severity":"info","title":"Replication complete - 100","body":"Replication of ubuntu-server (100) completed successfully.","vmid":"100","vmname":"ubuntu-server"}'
}
# ============================================================================
# BACKUP CATEGORY (group: backup)
# ============================================================================
test_backup() {
# Backup category: vzdump backup lifecycle and snapshot events.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} BACKUPS - Backup start, complete, fail${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. backup_start
send_test "backup_start" \
'{"type":"backup_start","component":"vzdump","severity":"info","title":"Backup started - 100","body":"Backup of ubuntu-server (100) has started.","vmid":"100","vmname":"ubuntu-server"}'
# 2. backup_complete
send_test "backup_complete" \
'{"type":"backup_complete","component":"vzdump","severity":"info","title":"Backup complete - 100","body":"Backup of ubuntu-server (100) completed successfully.\nSize: 12.4 GB","vmid":"100","vmname":"ubuntu-server","size":"12.4 GB"}'
# 3. backup_fail
send_test "backup_fail" \
'{"type":"backup_fail","component":"vzdump","severity":"critical","title":"Backup FAILED - 100","body":"Backup of ubuntu-server (100) has failed.\nStorage local-lvm is full","vmid":"100","vmname":"ubuntu-server","reason":"Storage local-lvm is full"}'
# 4. snapshot_complete
send_test "snapshot_complete" \
'{"type":"snapshot_complete","component":"qemu","severity":"info","title":"Snapshot created - 100","body":"Snapshot of ubuntu-server (100) created: pre-upgrade-2026","vmid":"100","vmname":"ubuntu-server","snapshot_name":"pre-upgrade-2026"}'
# 5. snapshot_fail
send_test "snapshot_fail" \
'{"type":"snapshot_fail","component":"qemu","severity":"critical","title":"Snapshot FAILED - 100","body":"Snapshot of ubuntu-server (100) failed.\nInsufficient space on storage","vmid":"100","vmname":"ubuntu-server","reason":"Insufficient space on storage"}'
}
# ============================================================================
# RESOURCES CATEGORY (group: resources)
# ============================================================================
test_resources() {
# Resources category: CPU, RAM, temperature and load threshold alerts.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} RESOURCES - CPU, memory, temperature${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. cpu_high
send_test "cpu_high" \
'{"type":"cpu_high","component":"health","severity":"warning","title":"High CPU usage (94%)","body":"CPU usage is at 94% on 16 cores.\nTop process: kvm (VM 100)","value":"94","cores":"16","details":"Top process: kvm (VM 100)"}'
# 2. ram_high
send_test "ram_high" \
'{"type":"ram_high","component":"health","severity":"warning","title":"High memory usage (91%)","body":"Memory usage: 58.2 GB / 64 GB (91%).\n4 VMs running, swap at 2.1 GB","value":"91","used":"58.2 GB","total":"64 GB","details":"4 VMs running, swap at 2.1 GB"}'
# 3. temp_high
send_test "temp_high" \
'{"type":"temp_high","component":"health","severity":"critical","title":"High temperature (89C)","body":"CPU temperature: 89C (threshold: 80C).\nCheck cooling system immediately","value":"89","threshold":"80","details":"Check cooling system immediately"}'
# 4. load_high
send_test "load_high" \
'{"type":"load_high","component":"health","severity":"warning","title":"High system load (24.5)","body":"System load average: 24.5 on 16 cores.\nI/O wait: 35%","value":"24.5","cores":"16","details":"I/O wait: 35%"}'
}
# ============================================================================
# STORAGE CATEGORY (group: storage)
# ============================================================================
test_storage() {
# Storage category: low disk space, I/O errors and burst-aggregated
# I/O error summaries.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} STORAGE - Disk space, I/O errors, SMART${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. disk_space_low
send_test "disk_space_low" \
'{"type":"disk_space_low","component":"storage","severity":"warning","title":"Low disk space on /var","body":"/var: 93% used (4.2 GB available).","mount":"/var","used":"93","available":"4.2 GB"}'
# 2. disk_io_error
send_test "disk_io_error" \
'{"type":"disk_io_error","component":"smart","severity":"critical","title":"Disk I/O error","body":"I/O error detected on /dev/sdb.\nSMART error: Current Pending Sector Count = 8","device":"/dev/sdb","reason":"SMART error: Current Pending Sector Count = 8"}'
# 3. burst_disk_io
send_test "burst_disk_io" \
'{"type":"burst_disk_io","component":"storage","severity":"critical","title":"5 disk I/O errors on /dev/sdb, /dev/sdc","body":"5 I/O errors detected in 60s.\nDevices: /dev/sdb, /dev/sdc","count":"5","window":"60s","entity_list":"/dev/sdb, /dev/sdc"}'
}
# ============================================================================
# NETWORK CATEGORY (group: network)
# ============================================================================
test_network() {
    # Network category: connectivity loss and high-latency alerts.
    local banner="${YELLOW}========================================${NC}"
    echo ""
    echo -e "$banner"
    echo -e "${YELLOW} NETWORK - Connectivity, bond, latency${NC}"
    echo -e "$banner"
    echo ""
    # Payloads mirror real pipeline events for this category.
    local down_payload='{"type":"network_down","component":"network","severity":"critical","title":"Network connectivity lost","body":"Network connectivity check failed.\nGateway 192.168.1.1 unreachable. Bond vmbr0 degraded.","reason":"Gateway 192.168.1.1 unreachable. Bond vmbr0 degraded."}'
    local latency_payload='{"type":"network_latency","component":"network","severity":"warning","title":"High network latency (450ms)","body":"Latency to gateway: 450ms (threshold: 100ms).","value":"450","threshold":"100"}'
    # 1. network_down
    send_test "network_down" "$down_payload"
    # 2. network_latency
    send_test "network_latency" "$latency_payload"
}
# ============================================================================
# SECURITY CATEGORY (group: security)
# ============================================================================
test_security() {
# Security category: auth failures, Fail2Ban bans, firewall issues,
# permission changes and their burst-aggregated variants.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} SECURITY - Auth failures, fail2ban, firewall${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. auth_fail
send_test "auth_fail" \
'{"type":"auth_fail","component":"auth","severity":"warning","title":"Authentication failure","body":"Failed login attempt from 203.0.113.42.\nUser: root\nService: sshd","source_ip":"203.0.113.42","username":"root","service":"sshd"}'
# 2. ip_block
send_test "ip_block" \
'{"type":"ip_block","component":"security","severity":"info","title":"IP blocked by Fail2Ban","body":"IP 203.0.113.42 has been banned.\nJail: sshd\nFailures: 5","source_ip":"203.0.113.42","jail":"sshd","failures":"5"}'
# 3. firewall_issue
send_test "firewall_issue" \
'{"type":"firewall_issue","component":"firewall","severity":"warning","title":"Firewall issue detected","body":"Firewall rule conflict detected on vmbr0.\nRule 15 overlaps with rule 23, potentially blocking cluster traffic.","reason":"Firewall rule conflict detected on vmbr0. Rule 15 overlaps with rule 23."}'
# 4. user_permission_change
send_test "user_permission_change" \
'{"type":"user_permission_change","component":"auth","severity":"info","title":"User permission changed","body":"User: admin@pam\nChange: Added PVEAdmin role on /vms/100","username":"admin@pam","change_details":"Added PVEAdmin role on /vms/100"}'
# 5. burst_auth_fail
send_test "burst_auth_fail" \
'{"type":"burst_auth_fail","component":"security","severity":"warning","title":"8 auth failures in 2m","body":"8 authentication failures detected in 2m.\nSources: 203.0.113.42, 198.51.100.7, 192.0.2.15","count":"8","window":"2m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15"}'
# 6. burst_ip_block
send_test "burst_ip_block" \
'{"type":"burst_ip_block","component":"security","severity":"info","title":"Fail2Ban banned 4 IPs in 5m","body":"4 IPs banned by Fail2Ban in 5m.\nIPs: 203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99","count":"4","window":"5m","entity_list":"203.0.113.42, 198.51.100.7, 192.0.2.15, 10.0.0.99"}'
}
# ============================================================================
# CLUSTER CATEGORY (group: cluster)
# ============================================================================
test_cluster() {
# Cluster category: quorum loss, node connectivity and flapping.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} CLUSTER - Quorum, split-brain, HA fencing${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
# 1. split_brain
send_test "split_brain" \
'{"type":"split_brain","component":"cluster","severity":"critical","title":"SPLIT-BRAIN detected","body":"Cluster split-brain condition detected.\nQuorum status: No quorum - 1/3 nodes visible","quorum":"No quorum - 1/3 nodes visible"}'
# 2. node_disconnect
send_test "node_disconnect" \
'{"type":"node_disconnect","component":"corosync","severity":"critical","title":"Node disconnected","body":"Node pve-node3 has disconnected from the cluster.","node_name":"pve-node3"}'
# 3. node_reconnect
send_test "node_reconnect" \
'{"type":"node_reconnect","component":"corosync","severity":"info","title":"Node reconnected","body":"Node pve-node3 has reconnected to the cluster.","node_name":"pve-node3"}'
# 4. burst_cluster
send_test "burst_cluster" \
'{"type":"burst_cluster","component":"cluster","severity":"critical","title":"Cluster flapping detected (6 changes)","body":"Cluster state changed 6 times in 5m.\nNodes: pve-node2, pve-node3","count":"6","window":"5m","entity_list":"pve-node2, pve-node3"}'
}
# ============================================================================
# BURST AGGREGATION TESTS (send rapid events to trigger burst detection)
# ============================================================================
test_burst() {
# Burst tests: send rapid identical-type events directly (bypassing
# send_test so there is no pause) to trigger server-side aggregation,
# then wait for the aggregated notification to be emitted.
echo ""
echo -e "${YELLOW}========================================${NC}"
echo -e "${YELLOW} BURST - Rapid events to trigger aggregation${NC}"
echo -e "${YELLOW}========================================${NC}"
echo ""
echo -e "${BLUE} Sending 5 rapid auth_fail events (should trigger burst_auth_fail)...${NC}"
for i in $(seq 1 5); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"auth_fail\",\"component\":\"auth\",\"severity\":\"warning\",\"title\":\"Auth fail from 10.0.0.$i\",\"body\":\"Failed login from 10.0.0.$i\",\"source_ip\":\"10.0.0.$i\"}" > /dev/null
echo -e " ${CYAN}Sent auth_fail $i/5${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
echo ""
echo -e "${BLUE} Sending 4 rapid disk_io_error events (should trigger burst_disk_io)...${NC}"
for i in $(seq 1 4); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"disk_io_error\",\"component\":\"smart\",\"severity\":\"critical\",\"title\":\"I/O error on /dev/sd${i}\",\"body\":\"Error on device\",\"device\":\"/dev/sd${i}\"}" > /dev/null
echo -e " ${CYAN}Sent disk_io_error $i/4${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
echo ""
echo -e "${BLUE} Sending 3 rapid node_disconnect events (should trigger burst_cluster)...${NC}"
for i in $(seq 1 3); do
curl -s -X POST "$API" \
-H "Content-Type: application/json" \
-d "{\"type\":\"node_disconnect\",\"component\":\"corosync\",\"severity\":\"critical\",\"title\":\"Node pve-node$i disconnected\",\"body\":\"Node lost\",\"node_name\":\"pve-node$i\"}" > /dev/null
echo -e " ${CYAN}Sent node_disconnect $i/3${NC}"
sleep 0.5
done
echo -e " ${GREEN}Done. Wait ~10s for burst aggregation...${NC}"
sleep 10
}
# ============================================================================
# MAIN
# ============================================================================
echo ""
echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} ProxMenux Notification System - Complete Test Suite${NC}"
echo -e "${BOLD}============================================================${NC}"
echo -e " API: $API"
echo -e " Pause: ${PAUSE}s between tests"
echo ""
# Check that the service is reachable
# Probe the status endpoint first so failures are reported up front
# instead of as 44 individual rejected tests.
status=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:8008/api/notifications/status" 2>/dev/null)
if [ "$status" != "200" ]; then
echo -e "${RED}ERROR: Notification service not reachable (HTTP $status)${NC}"
echo -e " Make sure ProxMenux Monitor is running."
exit 1
fi
echo -e "${GREEN}Service is reachable.${NC}"
# Parse argument
# Dispatch on the optional category argument; default runs everything.
category="${1:-all}"
case "$category" in
system) test_system ;;
vm_ct) test_vm_ct ;;
backup) test_backup ;;
resources) test_resources ;;
storage) test_storage ;;
network) test_network ;;
security) test_security ;;
cluster) test_cluster ;;
burst) test_burst ;;
all)
test_system
test_vm_ct
test_backup
test_resources
test_storage
test_network
test_security
test_cluster
test_burst
;;
*)
echo -e "${RED}Unknown category: $category${NC}"
echo "Usage: $0 [system|vm_ct|backup|resources|storage|network|security|cluster|burst|all]"
exit 1
;;
esac
# ============================================================================
# SUMMARY
# ============================================================================
# Report the pass/fail counters accumulated by send_test, plus helper
# one-liners for inspecting the notification history.
echo ""
echo -e "${BOLD}============================================================${NC}"
echo -e "${BOLD} SUMMARY${NC}"
echo -e "${BOLD}============================================================${NC}"
echo -e " Total tests: $test_count"
echo -e " ${GREEN}Accepted:${NC} $pass_count"
echo -e " ${RED}Rejected:${NC} $fail_count"
echo ""
echo -e " Check your notification channels for the messages."
echo -e " Note: Some events may be filtered by your current settings"
echo -e " (severity filter, disabled categories, disabled individual events)."
echo ""
echo -e " To check notification history (all events):"
echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -m json.tool${NC}"
echo ""
echo -e " To count events by type:"
# FIX: this echo previously lacked its closing double quote, so bash swallowed
# the rest of the script into the string and failed with an EOF syntax error.
echo -e " ${CYAN}curl -s 'http://127.0.0.1:8008/api/notifications/history?limit=200' | python3 -c \"import sys,json; h=json.load(sys.stdin)['history']; [print(f' {t}: {c}') for t,c in sorted(dict((e['event_type'],sum(1 for x in h if x['event_type']==e['event_type'])) for e in h).items())]\"${NC}"
echo ""

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""
Test script to simulate a disk error and verify observation recording.
Usage: python3 test_disk_observation.py [device_name] [error_type]
Examples:
python3 test_disk_observation.py sdh io_error
python3 test_disk_observation.py sdh smart_error
python3 test_disk_observation.py sdh fs_error
"""
import sys
import os

# Add possible module locations to path so health_persistence can be found
# whether the script runs from the repo, an installed tree, or an AppImage.
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, script_dir)
sys.path.insert(0, '/usr/local/share/proxmenux')
sys.path.insert(0, '/tmp/.mount_ProxMeztyU13/usr/bin') # AppImage mount point

# Try to find the module (diagnostic only: reports the first matching path,
# the actual import below still resolves via normal sys.path order).
for path in sys.path:
    if os.path.exists(os.path.join(path, 'health_persistence.py')):
        print(f"[INFO] Found health_persistence.py in: {path}")
        break

from health_persistence import HealthPersistence
from datetime import datetime
def main():
    """Simulate a disk error and verify observation recording.

    Reads an optional device name (default ``sdh``) and error type
    (default ``io_error``) from ``sys.argv``, records a synthetic disk
    observation through ``HealthPersistence``, then reads it back and
    prints both the stored observations and the disk-registry entry.
    """
    device_name = sys.argv[1] if len(sys.argv) > 1 else 'sdh'
    error_type = sys.argv[2] if len(sys.argv) > 2 else 'io_error'
    # Known serial for sdh (WDC 2TB)
    # NOTE(review): serials below are hard-coded for the author's test bench;
    # devices not listed fall back to serial=None.
    serial_map = {
        'sdh': 'WD-WX72A30AA72R',
        'nvme0n1': '2241E675EA6C',
        'nvme1n1': '2241E675EBE6',
        'sda': '22440F443504',
        'sdb': 'WWZ1SJ18',
        'sdc': '52X0A0D9FZ1G',
        'sdd': '50026B7784446E63',
        'sde': '22440F442105',
        'sdf': 'WRQ0X2GP',
        'sdg': '23Q0A0MPFZ1G',
    }
    serial = serial_map.get(device_name, None)
    # Error messages by type — one realistic raw log line per supported type.
    error_messages = {
        'io_error': f'Test I/O error on /dev/{device_name}: sector read failed at LBA 12345678',
        'smart_error': f'/dev/{device_name}: SMART warning - 1 Currently unreadable (pending) sectors detected',
        'fs_error': f'EXT4-fs error (device {device_name}1): inode 123456: block 789012: error reading data',
    }
    # Stable signatures so repeated runs update the same observation row.
    error_signatures = {
        'io_error': f'io_test_{device_name}',
        'smart_error': f'smart_test_{device_name}',
        'fs_error': f'fs_test_{device_name}',
    }
    message = error_messages.get(error_type, f'Test error on /dev/{device_name}')
    signature = error_signatures.get(error_type, f'test_{device_name}')
    print(f"\n{'='*60}")
    print(f"Testing Disk Observation Recording")
    print(f"{'='*60}")
    print(f"Device: /dev/{device_name}")
    print(f"Serial: {serial or 'Unknown'}")
    print(f"Error Type: {error_type}")
    print(f"Message: {message}")
    print(f"Signature: {signature}")
    print(f"{'='*60}\n")
    # Initialize persistence
    hp = HealthPersistence()
    # Record the observation
    print("[1] Recording observation...")
    hp.record_disk_observation(
        device_name=device_name,
        serial=serial,
        error_type=error_type,
        error_signature=signature,
        raw_message=message,
        severity='warning'
    )
    print(" OK - Observation recorded\n")
    # Query observations for this device — verifies the write round-trips.
    print("[2] Querying observations for this device...")
    observations = hp.get_disk_observations(device_name=device_name, serial=serial)
    if observations:
        print(f" Found {len(observations)} observation(s):\n")
        for obs in observations:
            print(f" ID: {obs['id']}")
            print(f" Type: {obs['error_type']}")
            print(f" Signature: {obs['error_signature']}")
            print(f" Message: {obs['raw_message'][:80]}...")
            print(f" Severity: {obs['severity']}")
            print(f" First: {obs['first_occurrence']}")
            print(f" Last: {obs['last_occurrence']}")
            print(f" Count: {obs['occurrence_count']}")
            print(f" Dismissed: {obs['dismissed']}")
            print()
    else:
        print(" No observations found!\n")
    # Also show the disk registry entry that should have been created/updated.
    print("[3] Checking disk registry...")
    all_devices = hp.get_all_observed_devices()
    for dev in all_devices:
        if dev.get('device_name') == device_name or dev.get('serial') == serial:
            print(f" Found in registry:")
            print(f" ID: {dev.get('id')}")
            print(f" Device: {dev.get('device_name')}")
            print(f" Serial: {dev.get('serial')}")
            print(f" First seen: {dev.get('first_seen')}")
            print(f" Last seen: {dev.get('last_seen')}")
            print()
    print(f"{'='*60}")
    print("Test complete! Check the Storage section in the UI.")
    print(f"The disk /dev/{device_name} should now show an observations badge.")
    print(f"{'='*60}\n")
# Script entry point.
if __name__ == '__main__':
    main()

---- View File: test_real_events.sh (entire file removed in this commit: @@ -1,732 +0,0 @@) ----
#!/bin/bash
# ============================================================================
# ProxMenux - Real Proxmox Event Simulator
# ============================================================================
# This script triggers ACTUAL events on Proxmox so that PVE's notification
# system fires real webhooks through the full pipeline:
#
# PVE event -> PVE notification -> webhook POST -> our pipeline -> Telegram
#
# Unlike test_all_notifications.sh (which injects directly via API), this
# script makes Proxmox generate the events itself.
#
# Usage:
# chmod +x test_real_events.sh
# ./test_real_events.sh # interactive menu
# ./test_real_events.sh disk # run disk tests only
# ./test_real_events.sh backup # run backup tests only
# ./test_real_events.sh all # run all tests
# ============================================================================
# Abort on errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail
# ANSI color codes used by the log helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'
# Local ProxMenux Monitor API base URL and a unique per-run log file
API="http://127.0.0.1:8008"
LOG_FILE="/tmp/proxmenux_real_test_$(date +%Y%m%d_%H%M%S).log"
# ── Helpers ─────────────────────────────────────────────────────
log() { echo -e "$1" | tee -a "$LOG_FILE"; }
# Print a bold section banner (blank line, rule, title, rule) to stdout + log.
header() {
    local rule="${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
    log ""
    log "$rule"
    log "${BOLD} $1${NC}"
    log "$rule"
}
# Severity-tagged log helpers: colored status prefix, message, color reset.
warn() {
    log "${YELLOW} [!] $1${NC}"
}
ok() {
    log "${GREEN} [OK] $1${NC}"
}
fail() {
    log "${RED} [FAIL] $1${NC}"
}
info() {
    log "${CYAN} [i] $1${NC}"
}
# Prompt the user with a Y/n question; returns success (0) on empty input
# or anything starting with y/Y, failure (1) otherwise.
confirm() {
    echo ""
    echo -e "${YELLOW} $1${NC}"
    echo -ne " Continue? [Y/n]: "
    read -r ans
    case "$ans" in
        "" | [Yy]*) return 0 ;;
        *) return 1 ;;
    esac
}
# Pause to give PVE time to deliver the webhook; default wait is 10 seconds.
wait_webhook() {
    local seconds
    seconds="${1:-10}"
    log " Waiting ${seconds}s for webhook delivery..."
    sleep "${seconds}"
}
# Print the current number of entries in the notification history.
# Used to take before/after counts around each injected event; falls back
# to "0" on any curl or JSON parse failure so arithmetic never breaks.
snapshot_history() {
    curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    count = len(data.get('history', []))
    print(count)
except:
    print(0)
" 2>/dev/null || echo "0"
}
# Compare the current history count against the given baseline ($1).
# If new notifications arrived, print the latest ones; otherwise warn.
check_new_events() {
    local before=$1
    local after
    after=$(snapshot_history)
    local diff=$((after - before))
    if [ "$diff" -gt 0 ]; then
        ok "Received $diff new notification(s) via webhook"
        # Show the latest events (note: $diff is interpolated by the shell
        # into the embedded Python before it runs)
        curl -s "${API}/api/notifications/history?limit=$((diff + 2))" 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
for h in data.get('history', [])[:$diff]:
    sev = h.get('severity', '?')
    icon = {'CRITICAL': ' RED', 'WARNING': ' YEL', 'INFO': ' BLU'}.get(sev, ' ???')
    print(f'{icon} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:60]}')
" 2>/dev/null | tee -a "$LOG_FILE"
    else
        warn "No new notifications detected (may need more time or check filters)"
    fi
}
# ── Pre-flight checks ──────────────────────────────────────────
# Validate the environment before any tests run: root privileges, ProxMenux
# API reachability, PVE webhook endpoint, notification settings, and webhook
# re-setup. Also selects a stopped guest for later tests.
# Sets globals: VMID, VMTYPE, VMNAME (all empty when no stopped guest exists).
# Exits on hard failures; soft failures offer a confirm() escape hatch.
preflight() {
    header "Pre-flight Checks"
    # Check if running as root
    if [ "$(id -u)" -ne 0 ]; then
        fail "This script must be run as root"
        exit 1
    fi
    ok "Running as root"
    # Check ProxMenux is running
    if curl -s "${API}/api/health" >/dev/null 2>&1; then
        ok "ProxMenux Monitor is running"
    else
        fail "ProxMenux Monitor not reachable at ${API}"
        exit 1
    fi
    # Check webhook is configured by querying PVE directly
    if pvesh get /cluster/notifications/endpoints/webhook --output-format json 2>/dev/null | python3 -c "
import sys, json
endpoints = json.load(sys.stdin)
found = any('proxmenux' in e.get('name','').lower() for e in (endpoints if isinstance(endpoints, list) else [endpoints]))
exit(0 if found else 1)
" 2>/dev/null; then
        ok "PVE webhook endpoint 'proxmenux-webhook' is configured"
    else
        warn "PVE webhook may not be configured. Run setup from the UI first."
        if ! confirm "Continue anyway?"; then
            exit 1
        fi
    fi
    # Check notification config
    # API returns { config: { enabled: true/false/'true'/'false', ... }, success: true }
    if curl -s "${API}/api/notifications/settings" 2>/dev/null | python3 -c "
import sys, json
d = json.load(sys.stdin)
cfg = d.get('config', d)
enabled = cfg.get('enabled', False)
exit(0 if enabled is True or str(enabled).lower() == 'true' else 1)
" 2>/dev/null; then
        ok "Notifications are enabled"
    else
        fail "Notifications are NOT enabled. Enable them in the UI first."
        exit 1
    fi
    # Re-run webhook setup to ensure priv config and body template exist
    info "Re-configuring PVE webhook (ensures priv config + body template)..."
    local setup_result
    setup_result=$(curl -s -X POST "${API}/api/notifications/proxmox/setup-webhook" 2>/dev/null)
    if echo "$setup_result" | python3 -c "import sys,json; d=json.load(sys.stdin); exit(0 if d.get('configured') else 1)" 2>/dev/null; then
        ok "PVE webhook re-configured successfully"
    else
        local setup_err
        setup_err=$(echo "$setup_result" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error','unknown'))" 2>/dev/null)
        warn "Webhook setup returned: ${setup_err}"
        warn "PVE webhook events may not work. Manual commands below:"
        # Print the fallback pvesh commands the API suggests, if any
        echo "$setup_result" | python3 -c "
import sys, json
d = json.load(sys.stdin)
for cmd in d.get('fallback_commands', []):
    print(f' {cmd}')
" 2>/dev/null
        if ! confirm "Continue anyway?"; then
            exit 1
        fi
    fi
    # Find a VM/CT for testing
    VMID=""
    VMNAME=""
    VMTYPE=""
    # Try to find a stopped CT first (safest)
    local cts
    cts=$(pvesh get /cluster/resources --type vm --output-format json 2>/dev/null || echo "[]")
    # Look for a stopped container (sort prefers lxc over qemu, stopped first)
    VMID=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
# Prefer stopped CTs, then stopped VMs
for v in sorted(vms, key=lambda x: (0 if x.get('type')=='lxc' else 1, 0 if x.get('status')=='stopped' else 1)):
    if v.get('status') == 'stopped':
        print(v.get('vmid', ''))
        break
" 2>/dev/null || echo "")
    if [ -n "$VMID" ]; then
        # Resolve the guest's type and name from the same resource listing
        # (note: '$VMID' is shell-interpolated into the Python source)
        VMTYPE=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
for v in vms:
    if str(v.get('vmid')) == '$VMID':
        print(v.get('type', 'qemu'))
        break
" 2>/dev/null)
        VMNAME=$(echo "$cts" | python3 -c "
import sys, json
vms = json.load(sys.stdin)
for v in vms:
    if str(v.get('vmid')) == '$VMID':
        print(v.get('name', 'unknown'))
        break
" 2>/dev/null)
        ok "Found stopped ${VMTYPE} for testing: ${VMID} (${VMNAME})"
    else
        warn "No stopped VM/CT found. Backup tests will use ID 0 (host backup)."
    fi
    # List available storage
    info "Available storage:"
    pvesh get /storage --output-format json 2>/dev/null | python3 -c "
import sys, json
stores = json.load(sys.stdin)
for s in stores:
    sid = s.get('storage', '?')
    stype = s.get('type', '?')
    content = s.get('content', '?')
    print(f' {sid:20s} type={stype:10s} content={content}')
" 2>/dev/null | tee -a "$LOG_FILE" || warn "Could not list storage"
    echo ""
    log " Log file: ${LOG_FILE}"
}
# ============================================================================
# TEST CATEGORY: DISK ERRORS
# ============================================================================
# Disk-error tests: inject SMART/ZFS/I-O messages into syslog (picked up by
# the JournalWatcher) and optionally create real disk-space pressure on /.
# All syslog injections are non-destructive; the space-pressure test writes
# a temp file and removes it afterwards.
test_disk() {
    header "DISK ERROR TESTS"
    # ── Test D1: SMART error injection ──
    log ""
    log "${BOLD} Test D1: SMART error log injection${NC}"
    info "Writes a simulated SMART error to syslog so JournalWatcher catches it."
    info "This tests the journal -> notification_events -> pipeline flow."
    local before
    before=$(snapshot_history)
    # Inject a realistic SMART error into the system journal
    logger -t kernel -p kern.err "ata1.00: exception Emask 0x0 SAct 0x0 SErr 0x0 action 0x6 frozen"
    sleep 1
    logger -t kernel -p kern.crit "ata1.00: failed command: READ FPDMA QUEUED"
    sleep 1
    logger -t smartd -p daemon.warning "Device: /dev/sda [SAT], 1 Currently unreadable (pending) sectors"
    wait_webhook 8
    check_new_events "$before"
    # ── Test D2: ZFS error simulation ──
    log ""
    log "${BOLD} Test D2: ZFS scrub error simulation${NC}"
    # Check if ZFS is available
    if command -v zpool >/dev/null 2>&1; then
        local zpools
        zpools=$(zpool list -H -o name 2>/dev/null || echo "")
        if [ -n "$zpools" ]; then
            local pool
            pool=$(echo "$zpools" | head -1)
            info "ZFS pool found: ${pool}"
            info "Injecting ZFS checksum error into syslog (non-destructive)."
            before=$(snapshot_history)
            # Simulate ZFS error events via syslog (non-destructive)
            logger -t kernel -p kern.warning "ZFS: pool '${pool}' has experienced an error"
            sleep 1
            logger -t zfs-module -p daemon.err "CHECKSUM error on ${pool}:mirror-0/sda: zio error"
            wait_webhook 8
            check_new_events "$before"
        else
            warn "ZFS installed but no pools found. Skipping ZFS test."
        fi
    else
        warn "ZFS not installed. Skipping ZFS test."
    fi
    # ── Test D3: Filesystem space pressure ──
    log ""
    log "${BOLD} Test D3: Disk space pressure simulation${NC}"
    info "Creates a large temporary file to fill disk, triggering space warnings."
    info "The Health Monitor should detect low disk space within ~60s."
    # Check current free space on / (awk coerces the 'NN%' field to a number)
    local free_pct
    free_pct=$(df / | tail -1 | awk '{print 100-$5}' | tr -d '%')
    info "Current free space on /: ${free_pct}%"
    if [ "$free_pct" -gt 15 ]; then
        info "Disk has ${free_pct}% free. Need to reduce below threshold for test."
        # Calculate how much to fill (leave only 8% free)
        local total_k free_k fill_k
        total_k=$(df / | tail -1 | awk '{print $2}')
        free_k=$(df / | tail -1 | awk '{print $4}')
        fill_k=$((free_k - (total_k * 8 / 100)))
        # Refuse to write more than ~50GB as a safety cap
        if [ "$fill_k" -gt 0 ] && [ "$fill_k" -lt 50000000 ]; then
            info "Will create ${fill_k}KB temp file to simulate low space."
            if confirm "This will temporarily fill disk to ~92% on /. Safe to proceed?"; then
                before=$(snapshot_history)
                # NOTE(review): if the script is interrupted during the 90s
                # wait, /tmp/.proxmenux_disk_test is left behind — consider a
                # trap-based cleanup. Confirm before relying on auto-cleanup.
                dd if=/dev/zero of=/tmp/.proxmenux_disk_test bs=1024 count="$fill_k" 2>/dev/null || true
                ok "Temp file created. Disk pressure active."
                info "Waiting 90s for Health Monitor to detect low space..."
                # Wait for health monitor polling cycle
                for i in $(seq 1 9); do
                    echo -ne "\r Waiting... ${i}0/90s"
                    sleep 10
                done
                echo ""
                # Clean up immediately
                rm -f /tmp/.proxmenux_disk_test
                ok "Temp file removed. Disk space restored."
                check_new_events "$before"
            else
                warn "Skipped disk pressure test."
            fi
        else
            warn "Cannot safely fill disk (would need ${fill_k}KB). Skipping."
        fi
    else
        warn "Disk already at ${free_pct}% free. Health Monitor may already be alerting."
    fi
    # ── Test D4: I/O error in syslog ──
    log ""
    log "${BOLD} Test D4: Generic I/O error injection${NC}"
    info "Injects I/O errors into syslog for JournalWatcher."
    before=$(snapshot_history)
    logger -t kernel -p kern.err "Buffer I/O error on dev sdb1, logical block 0, async page read"
    sleep 1
    logger -t kernel -p kern.err "EXT4-fs error (device sdb1): ext4_find_entry:1455: inode #2: comm ls: reading directory lblock 0"
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# TEST CATEGORY: BACKUP EVENTS
# ============================================================================
# Backup-event tests: a real vzdump success, a deliberate vzdump failure,
# snapshot create/delete, and a direct PVE notification-system test.
# Uses globals VMID/VMTYPE/VMNAME set by preflight(); guest-dependent tests
# are skipped when no stopped VM/CT was found.
#
# BUG FIX: the B1 cleanup previously ran `find -newer MARKER` before the
# marker file existed and never removed the found path, so every test run
# left its backup on disk. The marker is now created BEFORE vzdump and the
# fresh backup file is actually deleted afterwards.
test_backup() {
    header "BACKUP EVENT TESTS"
    local backup_storage=""
    # Find backup-capable storage: first one advertising 'backup'/'vztmpl'
    # content, falling back to 'local' (python for/else handles the fallback)
    backup_storage=$(pvesh get /storage --output-format json 2>/dev/null | python3 -c "
import sys, json
stores = json.load(sys.stdin)
for s in stores:
    content = s.get('content', '')
    if 'backup' in content or 'vztmpl' in content:
        print(s.get('storage', ''))
        break
# Fallback: try 'local'
else:
    for s in stores:
        if s.get('storage') == 'local':
            print('local')
            break
" 2>/dev/null || echo "local")
    info "Using backup storage: ${backup_storage}"
    # ── Test B1: Successful vzdump backup ──
    if [ -n "$VMID" ]; then
        log ""
        log "${BOLD} Test B1: Real vzdump backup (success)${NC}"
        info "Running a real vzdump backup of ${VMTYPE} ${VMID} (${VMNAME})."
        info "This triggers PVE's notification system with a real backup event."
        if confirm "This will backup ${VMTYPE} ${VMID} to '${backup_storage}'. Proceed?"; then
            local before
            before=$(snapshot_history)
            # Use snapshot mode for VMs (non-disruptive), suspend mode for CTs
            local bmode="snapshot"
            if [ "$VMTYPE" = "lxc" ]; then
                bmode="suspend"
            fi
            # Create the marker BEFORE vzdump so 'find -newer' below can
            # identify the backup file this test run produced.
            touch /tmp/.proxmenux_bak_marker 2>/dev/null || true
            info "Starting vzdump (mode=${bmode}, compress=zstd)..."
            if vzdump "$VMID" --storage "$backup_storage" --mode "$bmode" --compress zstd --notes-template "ProxMenux test backup" 2>&1 | tee -a "$LOG_FILE"; then
                ok "vzdump completed successfully!"
            else
                warn "vzdump returned non-zero (check output above)"
            fi
            wait_webhook 12
            check_new_events "$before"
            # Clean up the test backup created after the marker.
            # NOTE(review): assumes directory-backed storage under
            # /var/lib/vz/dump — backups on other storage types are kept.
            info "Cleaning up test backup file..."
            local latest_bak
            latest_bak=$(find "/var/lib/vz/dump/" -name "vzdump-*-${VMID}-*" -type f -newer /tmp/.proxmenux_bak_marker 2>/dev/null | head -1 || echo "")
            if [ -n "$latest_bak" ]; then
                rm -f "$latest_bak"
                ok "Removed test backup: ${latest_bak}"
            else
                warn "No fresh backup file found under /var/lib/vz/dump/ (storage may not be dir-based)"
            fi
            rm -f /tmp/.proxmenux_bak_marker 2>/dev/null || true
        else
            warn "Skipped backup success test."
        fi
        # ── Test B2: Failed vzdump backup ──
        log ""
        log "${BOLD} Test B2: vzdump backup failure (invalid storage)${NC}"
        info "Attempting backup to non-existent storage to trigger a backup failure event."
        before=$(snapshot_history)
        # This WILL fail because the storage doesn't exist
        info "Starting vzdump to fake storage (will fail intentionally)..."
        vzdump "$VMID" --storage "nonexistent_storage_12345" --mode snapshot 2>&1 | tail -5 | tee -a "$LOG_FILE" || true
        warn "vzdump failed as expected (this is intentional)."
        wait_webhook 12
        check_new_events "$before"
    else
        warn "No VM/CT available for backup tests."
        info "You can create a minimal LXC container for testing:"
        info " pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1"
    fi
    # ── Test B3: Snapshot create/delete ──
    if [ -n "$VMID" ] && [ "$VMTYPE" = "qemu" ]; then
        log ""
        log "${BOLD} Test B3: VM Snapshot create & delete${NC}"
        info "Creating a snapshot of VM ${VMID} to test snapshot events."
        if confirm "Create snapshot 'proxmenux_test' on VM ${VMID}?"; then
            local before
            before=$(snapshot_history)
            if qm snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then
                ok "Snapshot created!"
            else
                warn "Snapshot creation returned non-zero"
            fi
            wait_webhook 10
            check_new_events "$before"
            # Clean up snapshot
            info "Cleaning up test snapshot..."
            qm delsnapshot "$VMID" proxmenux_test 2>/dev/null || true
            ok "Snapshot removed."
        fi
    elif [ -n "$VMID" ] && [ "$VMTYPE" = "lxc" ]; then
        log ""
        log "${BOLD} Test B3: CT Snapshot create & delete${NC}"
        info "Creating a snapshot of CT ${VMID}."
        if confirm "Create snapshot 'proxmenux_test' on CT ${VMID}?"; then
            local before
            before=$(snapshot_history)
            if pct snapshot "$VMID" proxmenux_test --description "ProxMenux test snapshot" 2>&1 | tee -a "$LOG_FILE"; then
                ok "Snapshot created!"
            else
                warn "Snapshot creation returned non-zero"
            fi
            wait_webhook 10
            check_new_events "$before"
            # Clean up
            info "Cleaning up test snapshot..."
            pct delsnapshot "$VMID" proxmenux_test 2>/dev/null || true
            ok "Snapshot removed."
        fi
    fi
    # ── Test B4: PVE scheduled backup notification ──
    log ""
    log "${BOLD} Test B4: Trigger PVE notification system directly${NC}"
    info "Using 'pvesh create /notifications/endpoints/...' to test PVE's own system."
    info "This sends a test notification through PVE, which should hit our webhook."
    local before
    before=$(snapshot_history)
    # PVE 8.x has a test endpoint for notifications
    if pvesh create /notifications/targets/test --target proxmenux-webhook 2>&1 | tee -a "$LOG_FILE"; then
        ok "PVE test notification sent!"
    else
        # Try alternative method
        info "Direct test not available. Trying via API..."
        pvesh set /notifications/endpoints/webhook/proxmenux-webhook --test 1 2>/dev/null || \
            warn "Could not send PVE test notification (requires PVE 8.1+)"
    fi
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# TEST CATEGORY: VM/CT LIFECYCLE
# ============================================================================
# Start/stop lifecycle tests against the stopped guest chosen by preflight().
# The guest is started (V1), then stopped again (V2), restoring its
# pre-test state. Returns early when no stopped VM/CT is available.
test_vmct() {
    header "VM/CT LIFECYCLE TESTS"
    if [ -z "$VMID" ]; then
        warn "No stopped VM/CT found for lifecycle tests."
        info "Create a minimal CT: pct create 9999 local:vztmpl/debian-12-standard_12.2-1_amd64.tar.zst --storage local-lvm --memory 128 --cores 1"
        return
    fi
    log ""
    log "${BOLD} Test V1: Start ${VMTYPE} ${VMID} (${VMNAME})${NC}"
    if confirm "Start ${VMTYPE} ${VMID}? It will be stopped again after the test."; then
        local before
        before=$(snapshot_history)
        # pct for containers, qm for QEMU VMs
        if [ "$VMTYPE" = "lxc" ]; then
            pct start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        else
            qm start "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        fi
        ok "Start command sent."
        wait_webhook 10
        check_new_events "$before"
        # Wait a moment
        sleep 5
        # ── Test V2: Stop ──
        log ""
        log "${BOLD} Test V2: Stop ${VMTYPE} ${VMID}${NC}"
        before=$(snapshot_history)
        if [ "$VMTYPE" = "lxc" ]; then
            pct stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        else
            qm stop "$VMID" 2>&1 | tee -a "$LOG_FILE" || true
        fi
        ok "Stop command sent."
        wait_webhook 10
        check_new_events "$before"
    fi
}
# ============================================================================
# TEST CATEGORY: SYSTEM EVENTS (via syslog injection)
# ============================================================================
# System-event tests: inject SSH auth failures, firewall drops, and service
# failures into syslog so the JournalWatcher can pick them up. No real
# services are touched — everything goes through logger(1).
test_system() {
    header "SYSTEM EVENT TESTS (syslog injection)"
    # ── Test S1: Authentication failures ──
    log ""
    log "${BOLD} Test S1: SSH auth failure injection${NC}"
    info "Injecting SSH auth failure messages into syslog."
    local before
    before=$(snapshot_history)
    # Three failures from two source IPs — resembles a brute-force pattern
    logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44312 ssh2"
    sleep 2
    logger -t sshd -p auth.warning "Failed password for invalid user admin from 10.0.0.50 port 55123 ssh2"
    sleep 2
    logger -t sshd -p auth.warning "Failed password for root from 192.168.1.200 port 44315 ssh2"
    wait_webhook 8
    check_new_events "$before"
    # ── Test S2: Firewall event ──
    log ""
    log "${BOLD} Test S2: Firewall drop event${NC}"
    before=$(snapshot_history)
    logger -t kernel -p kern.warning "pve-fw-reject: IN=vmbr0 OUT= MAC=00:11:22:33:44:55 SRC=10.0.0.99 DST=192.168.1.1 PROTO=TCP DPT=22 REJECT"
    sleep 2
    logger -t pvefw -p daemon.warning "firewall: blocked incoming connection from 10.0.0.99:45678 to 192.168.1.1:8006"
    wait_webhook 8
    check_new_events "$before"
    # ── Test S3: Service failure ──
    log ""
    log "${BOLD} Test S3: Service failure injection${NC}"
    before=$(snapshot_history)
    logger -t systemd -p daemon.err "pvedaemon.service: Main process exited, code=exited, status=1/FAILURE"
    sleep 1
    logger -t systemd -p daemon.err "Failed to start Proxmox VE API Daemon."
    wait_webhook 8
    check_new_events "$before"
}
# ============================================================================
# SUMMARY & REPORT
# ============================================================================
# Print an aggregated report of the notification history: totals grouped by
# severity, source, and event type, plus the latest 15 events.
show_summary() {
    header "TEST SUMMARY"
    info "Fetching full notification history..."
    echo ""
    curl -s "${API}/api/notifications/history?limit=200" 2>/dev/null | python3 -c "
import sys, json
from collections import Counter
data = json.load(sys.stdin)
history = data.get('history', [])
if not history:
    print(' No notifications in history.')
    sys.exit(0)
# Group by event_type
by_type = Counter(h['event_type'] for h in history)
# Group by severity
by_sev = Counter(h.get('severity', '?') for h in history)
# Group by source
by_src = Counter(h.get('source', '?') for h in history)
print(f' Total notifications: {len(history)}')
print()
sev_icons = {'CRITICAL': '\033[0;31mCRITICAL\033[0m', 'WARNING': '\033[1;33mWARNING\033[0m', 'INFO': '\033[0;36mINFO\033[0m'}
print(' By severity:')
for sev, count in by_sev.most_common():
    icon = sev_icons.get(sev, sev)
    print(f' {icon}: {count}')
print()
print(' By source:')
for src, count in by_src.most_common():
    print(f' {src:20s}: {count}')
print()
print(' By event type:')
for etype, count in by_type.most_common():
    print(f' {etype:30s}: {count}')
print()
print(' Latest 15 events:')
for h in history[:15]:
    sev = h.get('severity', '?')
    icon = {'CRITICAL': ' \033[0;31mRED\033[0m', 'WARNING': ' \033[1;33mYEL\033[0m', 'INFO': ' \033[0;36mBLU\033[0m'}.get(sev, ' ???')
    ts = h.get('sent_at', '?')[:19]
    src = h.get('source', '?')[:12]
    print(f' {icon} {ts} {src:12s} {h[\"event_type\"]:25s} {h.get(\"title\", \"\")[:50]}')
" 2>/dev/null | tee -a "$LOG_FILE"
    echo ""
    info "Full log saved to: ${LOG_FILE}"
    echo ""
    info "To see all history:"
    echo -e " ${CYAN}curl -s '${API}/api/notifications/history?limit=200' | python3 -m json.tool${NC}"
    echo ""
    info "To check Telegram delivery, look at your Telegram bot chat."
}
# ============================================================================
# INTERACTIVE MENU
# ============================================================================
# Print the interactive menu and leave the cursor after the "Select:" prompt.
show_menu() {
    printf '\n'
    printf '%b\n' "${BOLD} ProxMenux Real Event Test Suite${NC}"
    printf '\n'
    local -a entries=(
        "${CYAN}1)${NC} Disk error tests (SMART, ZFS, I/O, space pressure)"
        "${CYAN}2)${NC} Backup tests (vzdump success/fail, snapshots)"
        "${CYAN}3)${NC} VM/CT lifecycle tests (start/stop real VMs)"
        "${CYAN}4)${NC} System event tests (auth, firewall, service failures)"
        "${CYAN}5)${NC} Run ALL tests"
        "${CYAN}6)${NC} Show summary report"
        "${CYAN}q)${NC} Exit"
    )
    local entry
    for entry in "${entries[@]}"; do
        printf '%b\n' " ${entry}"
    done
    printf '\n'
    printf '%b' " Select: "
}
# ── Main ────────────────────────────────────────────────────────
# Entry point: print the banner, run pre-flight checks, then dispatch on the
# requested mode (disk|backup|vmct|system|all) or drop into the interactive
# menu for any other/missing argument.
main() {
    local mode="${1:-menu}"
    echo ""
    echo -e "${BOLD}============================================================${NC}"
    echo -e "${BOLD} ProxMenux - Real Proxmox Event Simulator${NC}"
    echo -e "${BOLD}============================================================${NC}"
    echo -e " Tests REAL events through the full PVE -> webhook pipeline."
    echo -e " Log file: ${CYAN}${LOG_FILE}${NC}"
    echo ""
    preflight
    case "$mode" in
        disk) test_disk; show_summary ;;
        backup) test_backup; show_summary ;;
        vmct) test_vmct; show_summary ;;
        system) test_system; show_summary ;;
        all)
            test_disk
            test_backup
            test_vmct
            test_system
            show_summary
            ;;
        menu|*)
            # Interactive loop; option 5 runs everything, summarizes, and exits
            while true; do
                show_menu
                read -r choice
                case "$choice" in
                    1) test_disk ;;
                    2) test_backup ;;
                    3) test_vmct ;;
                    4) test_system ;;
                    5) test_disk; test_backup; test_vmct; test_system; show_summary; break ;;
                    6) show_summary ;;
                    q|Q) echo " Bye!"; break ;;
                    *) warn "Invalid option" ;;
                esac
            done
            ;;
    esac
}
main "${1:-menu}"