mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 10:02:16 +00:00
Update notification service
This commit is contained in:
375
AppImage/scripts/ai_context_enrichment.py
Normal file
375
AppImage/scripts/ai_context_enrichment.py
Normal file
@@ -0,0 +1,375 @@
|
||||
#!/usr/bin/env python3
"""
AI Context Enrichment Module

Enriches notification context with additional information to help AI provide
more accurate and helpful responses:

1. Event frequency - how often this error has occurred
2. System uptime - helps distinguish startup issues from runtime failures
3. SMART disk data - for disk-related errors
4. Known error matching - from proxmox_known_errors database

Author: MacRimi
"""

import os
import re
import subprocess
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
import sqlite3
from pathlib import Path

# Import known errors database.
# Fall back to no-op stubs so this module stays importable (and enrichment
# degrades gracefully) when proxmox_known_errors is not available.
try:
    from proxmox_known_errors import get_error_context, find_matching_error
except ImportError:
    def get_error_context(*args, **kwargs):
        # Stub: known-error database not installed; callers treat None as "no match".
        return None
    def find_matching_error(*args, **kwargs):
        # Stub: known-error database not installed.
        return None

# SQLite database maintained by the health monitor; read by get_event_frequency().
DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db')
||||
def get_system_uptime() -> str:
    """Get system uptime in human-readable format.

    Reads /proc/uptime and renders the elapsed time, annotated with a hint
    that helps the AI distinguish boot-time problems from runtime failures.

    Returns:
        String like "2 minutes (recently booted)" or
        "89 days, 4 hours (stable system)"; "unknown" on any failure.
    """
    try:
        with open('/proc/uptime', 'r') as f:
            total_seconds = float(f.readline().split()[0])

        # Break the total down into whole days/hours/minutes.
        whole = int(total_seconds)
        days, rest = divmod(whole, 86400)
        hours, rest = divmod(rest, 3600)
        minutes = rest // 60

        pieces = []
        if days > 0:
            pieces.append(f"{days} day{'s' if days != 1 else ''}")
        if hours > 0:
            pieces.append(f"{hours} hour{'s' if hours != 1 else ''}")
        if not pieces:  # Less than an hour total
            pieces.append(f"{minutes} minute{'s' if minutes != 1 else ''}")

        readable = ", ".join(pieces)

        # Append a context hint based on how long the system has been up.
        if total_seconds < 600:  # under 10 minutes
            return f"{readable} (just booted - likely startup issue)"
        if total_seconds < 3600:  # under 1 hour
            return f"{readable} (recently booted)"
        if days >= 30:
            return f"{readable} (stable system)"
        return readable

    except Exception:
        return "unknown"
|
||||
def get_event_frequency(error_id: Optional[str] = None, error_key: Optional[str] = None,
                        category: Optional[str] = None, hours: int = 24) -> Optional[Dict[str, Any]]:
    """Get frequency information for an error from the database.

    Looks up the most recent matching row in the health monitor's `errors`
    table and summarizes how often and how long it has been occurring.

    Args:
        error_id: Specific error ID to look up (matched against both
            error_key and error_id columns)
        error_key: Alternative error key
        category: Error category (only unresolved errors are matched)
        hours: Time window to check (default 24h)
            NOTE(review): currently accepted but not applied to any query —
            confirm whether the lookups should be bounded by this window.

    Returns:
        Dict with frequency info ('occurrences', 'category', and optionally
        'first_seen_ago' and 'pattern') or None if no match / no database.
    """
    if not DB_PATH.exists():
        return None

    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=5)
        cursor = conn.cursor()

        # Try to find the error; precedence is error_id, then error_key,
        # then category (unresolved only). Most recent match wins.
        if error_id:
            cursor.execute('''
                SELECT first_seen, last_seen, occurrences, category
                FROM errors WHERE error_key = ? OR error_id = ?
                ORDER BY last_seen DESC LIMIT 1
            ''', (error_id, error_id))
        elif error_key:
            cursor.execute('''
                SELECT first_seen, last_seen, occurrences, category
                FROM errors WHERE error_key = ?
                ORDER BY last_seen DESC LIMIT 1
            ''', (error_key,))
        elif category:
            cursor.execute('''
                SELECT first_seen, last_seen, occurrences, category
                FROM errors WHERE category = ? AND resolved_at IS NULL
                ORDER BY last_seen DESC LIMIT 1
            ''', (category,))
        else:
            # Nothing to search by.
            conn.close()
            return None

        row = cursor.fetchone()
        conn.close()

        if not row:
            return None

        first_seen, last_seen, occurrences, cat = row

        # Calculate age and recurrence pattern from the stored ISO timestamps.
        try:
            first_dt = datetime.fromisoformat(first_seen) if first_seen else None
            last_dt = datetime.fromisoformat(last_seen) if last_seen else None
            now = datetime.now()

            result = {
                'occurrences': occurrences or 1,
                'category': cat
            }

            if first_dt:
                age = now - first_dt
                if age.total_seconds() < 3600:
                    result['first_seen_ago'] = f"{int(age.total_seconds() / 60)} minutes ago"
                elif age.total_seconds() < 86400:
                    result['first_seen_ago'] = f"{int(age.total_seconds() / 3600)} hours ago"
                else:
                    result['first_seen_ago'] = f"{age.days} days ago"

            if last_dt and first_dt and occurrences and occurrences > 1:
                # Calculate average interval between occurrences; N occurrences
                # span N-1 intervals.
                span = (last_dt - first_dt).total_seconds()
                if span > 0 and occurrences > 1:
                    avg_interval = span / (occurrences - 1)
                    if avg_interval < 60:
                        result['pattern'] = f"recurring every ~{int(avg_interval)} seconds"
                    elif avg_interval < 3600:
                        result['pattern'] = f"recurring every ~{int(avg_interval / 60)} minutes"
                    else:
                        result['pattern'] = f"recurring every ~{int(avg_interval / 3600)} hours"

            return result

        except (ValueError, TypeError):
            # Malformed timestamps: still report the raw counts.
            return {'occurrences': occurrences or 1, 'category': cat}

    except Exception as e:
        print(f"[AIContext] Error getting frequency: {e}")
        return None
|
||||
|
||||
|
||||
def get_smart_data(disk_device: str) -> Optional[str]:
    """Get SMART health data for a disk.

    Runs `smartctl -H` for the overall verdict and `smartctl -A` for the
    critical attribute counters, then formats a short summary.

    Args:
        disk_device: Device path like /dev/sda or just sda

    Returns:
        Formatted SMART summary or None (missing device, no smartctl,
        timeout, or any other failure)
    """
    if not disk_device:
        return None

    # Accept bare names like "sda" by prefixing /dev/.
    device = disk_device if disk_device.startswith('/dev/') else f'/dev/{disk_device}'

    if not os.path.exists(device):
        return None

    try:
        # Overall health verdict.
        health_run = subprocess.run(
            ['smartctl', '-H', device],
            capture_output=True, text=True, timeout=10
        )
        if "PASSED" in health_run.stdout:
            health_status = "PASSED"
        elif "FAILED" in health_run.stdout:
            health_status = "FAILED"
        else:
            health_status = "UNKNOWN"

        # Attribute table; we only care about the failure-predicting counters.
        attrs_run = subprocess.run(
            ['smartctl', '-A', device],
            capture_output=True, text=True, timeout=10
        )

        critical_attrs = [
            'Reallocated_Sector_Ct', 'Current_Pending_Sector',
            'Offline_Uncorrectable', 'UDMA_CRC_Error_Count',
            'Reallocated_Event_Count', 'Reported_Uncorrect'
        ]
        attributes = {}
        for row in attrs_run.stdout.split('\n'):
            for attr in critical_attrs:
                if attr in row:
                    fields = row.split()
                    # Typical row: ID NAME FLAGS VALUE WORST THRESH TYPE UPDATED ... RAW_VALUE
                    if len(fields) >= 10:
                        attributes[attr] = fields[-1]

        summary = [f"SMART Health: {health_status}"]

        # Only surface attributes whose raw count is non-zero.
        for attr, value in attributes.items():
            try:
                if int(value) > 0:
                    summary.append(f"  {attr}: {value}")
            except ValueError:
                pass

        if len(summary) > 1 or health_status == "FAILED":
            return "\n".join(summary)
        return f"SMART Health: {health_status}"

    except subprocess.TimeoutExpired:
        return None
    except FileNotFoundError:
        # smartctl not installed
        return None
    except Exception:
        return None
|
||||
|
||||
|
||||
def extract_disk_device(text: str) -> Optional[str]:
    """Extract disk device name from error text.

    Args:
        text: Error message or log content

    Returns:
        Device name like 'sda' or None
    """
    if not text:
        return None

    # Checked in priority order: explicit /dev/ paths first (these may
    # include a partition number), then bare device names in prose.
    device_patterns = (
        r'/dev/(sd[a-z]\d*)',
        r'/dev/(nvme\d+n\d+(?:p\d+)?)',
        r'/dev/(hd[a-z]\d*)',
        r'/dev/(vd[a-z]\d*)',
        r'\b(sd[a-z])\b',
        r'disk[_\s]+(sd[a-z])',
        r'ata\d+\.\d+: (sd[a-z])',
    )

    for regex in device_patterns:
        found = re.search(regex, text, re.IGNORECASE)
        if found:
            return found.group(1)

    return None
|
||||
|
||||
|
||||
def enrich_context_for_ai(
    title: str,
    body: str,
    event_type: str,
    data: Dict[str, Any],
    journal_context: str = '',
    detail_level: str = 'standard'
) -> str:
    """Build enriched context string for AI processing.

    Combines:
    - Original journal context
    - Event frequency information
    - System uptime
    - SMART data (for disk errors)
    - Known error matching

    Args:
        title: Notification title
        body: Notification body
        event_type: Type of event
        data: Event data dict
        journal_context: Original journal log context
        detail_level: Level of detail (minimal, standard, detailed)

    Returns:
        Enriched context string (falls back to the raw journal context, or
        "", when nothing could be gathered)
    """
    sections = []
    combined_text = f"{title} {body} {journal_context}"

    # 1. System uptime (always useful)
    uptime = get_system_uptime()
    if uptime and uptime != "unknown":
        sections.append(f"System uptime: {uptime}")

    # 2. Event frequency from the health monitor database
    error_key = data.get('error_key') or data.get('error_id')
    category = data.get('category')

    freq = get_event_frequency(error_id=error_key, category=category)
    if freq:
        freq_line = f"Event frequency: {freq.get('occurrences', 1)} occurrence(s)"
        first_seen_ago = freq.get('first_seen_ago')
        if first_seen_ago:
            freq_line += f", first seen {first_seen_ago}"
        pattern = freq.get('pattern')
        if pattern:
            freq_line += f", {pattern}"
        sections.append(freq_line)

    # 3. SMART data for disk-related events: check the event type first,
    # then fall back to scanning the combined message text.
    lowered_type = event_type.lower()
    disk_related = any(k in lowered_type for k in ['disk', 'smart', 'storage', 'io_error'])
    if not disk_related:
        lowered_text = combined_text.lower()
        disk_related = any(k in lowered_text for k in ['disk', 'smart', '/dev/sd', 'ata', 'i/o error'])

    if disk_related:
        device = extract_disk_device(combined_text)
        if device:
            smart_summary = get_smart_data(device)
            if smart_summary:
                sections.append(smart_summary)

    # 4. Known error matching (no-op stub if the database is unavailable)
    known_error_ctx = get_error_context(combined_text, category=category, detail_level=detail_level)
    if known_error_ctx:
        sections.append(known_error_ctx)

    # 5. Original journal context goes last
    if journal_context:
        sections.append(f"Journal logs:\n{journal_context}")

    if sections:
        return "\n\n".join(sections)
    return journal_context or ""
|
||||
|
||||
|
||||
def get_enriched_context(
    event: 'NotificationEvent',
    detail_level: str = 'standard'
) -> str:
    """Convenience function to enrich context from a NotificationEvent.

    Args:
        event: NotificationEvent object (title/body/message and journal
            context are read from event.data)
        detail_level: Level of detail

    Returns:
        Enriched context string
    """
    payload = event.data

    return enrich_context_for_ai(
        title=payload.get('title', ''),
        body=payload.get('body', payload.get('message', '')),
        event_type=event.event_type,
        data=payload,
        journal_context=payload.get('_journal_context', ''),
        detail_level=detail_level
    )
|
||||
@@ -95,6 +95,8 @@ cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo
|
||||
cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found"
|
||||
cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found"
|
||||
cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_events.py not found"
|
||||
cp "$SCRIPT_DIR/proxmox_known_errors.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_known_errors.py not found"
|
||||
cp "$SCRIPT_DIR/ai_context_enrichment.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ ai_context_enrichment.py not found"
|
||||
cp "$SCRIPT_DIR/startup_grace.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ startup_grace.py not found"
|
||||
cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_notification_routes.py not found"
|
||||
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
|
||||
|
||||
@@ -862,6 +862,307 @@ class HealthPersistence:
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
# Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
|
||||
self._cleanup_stale_resources()
|
||||
|
||||
def _cleanup_stale_resources(self):
    """Resolve errors for resources that no longer exist.

    Comprehensive cleanup for ALL error categories:
    - VMs/CTs: deleted resources (not just stopped)
    - Disks: physically removed devices, ZFS pools, storage
    - Network: removed interfaces, bonds, bridges
    - Services/pve_services: services on deleted CTs, stopped services
    - Logs: persistent/spike/cascade errors older than 48h
    - Cluster: errors when node is no longer in cluster
    - Temperature: sensors that no longer exist
    - Memory/Storage: mount points that no longer exist
    - Updates/Security: acknowledged errors older than 7 days
    - General fallback: any error older than 7 days with no recent activity

    NOTE(review): resolution_reason is computed for every branch but never
    persisted or logged per-error, and the fetched `severity` column is
    unused — confirm whether these should be written to the database.
    """
    import subprocess
    import re

    conn = self._get_conn()
    cursor = conn.cursor()
    now = datetime.now()
    now_iso = now.isoformat()

    # Get all active (unresolved) errors with first_seen and last_seen for age checks
    cursor.execute('''
        SELECT id, error_key, category, message, first_seen, last_seen, severity FROM errors
        WHERE resolved_at IS NULL
    ''')
    active_errors = cursor.fetchall()

    resolved_count = 0

    # Cache for expensive checks (avoid repeated subprocess calls);
    # each cache is lazily populated by the closures below.
    _vm_ct_exists_cache = {}
    _cluster_status_cache = None
    _network_interfaces_cache = None
    _zfs_pools_cache = None
    _mount_points_cache = None
    _pve_services_cache = None

    def check_vm_ct_cached(vmid):
        # Memoized wrapper around the qm/pct existence probe.
        if vmid not in _vm_ct_exists_cache:
            _vm_ct_exists_cache[vmid] = self._check_vm_ct_exists(vmid)
        return _vm_ct_exists_cache[vmid]

    def get_cluster_status():
        # One `pvecm status` call per cleanup pass.
        nonlocal _cluster_status_cache
        if _cluster_status_cache is None:
            try:
                result = subprocess.run(
                    ['pvecm', 'status'],
                    capture_output=True, text=True, timeout=5
                )
                _cluster_status_cache = {
                    'is_cluster': result.returncode == 0 and 'Cluster information' in result.stdout,
                    'nodes': result.stdout if result.returncode == 0 else ''
                }
            except Exception:
                _cluster_status_cache = {'is_cluster': True, 'nodes': ''}  # Assume cluster on error
        return _cluster_status_cache

    def get_network_interfaces():
        # Current interface names via psutil; empty set if psutil fails.
        nonlocal _network_interfaces_cache
        if _network_interfaces_cache is None:
            try:
                import psutil
                _network_interfaces_cache = set(psutil.net_if_stats().keys())
            except Exception:
                _network_interfaces_cache = set()
        return _network_interfaces_cache

    def get_zfs_pools():
        # Names of imported ZFS pools; empty set when zpool is unavailable.
        nonlocal _zfs_pools_cache
        if _zfs_pools_cache is None:
            try:
                result = subprocess.run(
                    ['zpool', 'list', '-H', '-o', 'name'],
                    capture_output=True, text=True, timeout=5
                )
                if result.returncode == 0:
                    _zfs_pools_cache = set(result.stdout.strip().split('\n'))
                else:
                    _zfs_pools_cache = set()
            except Exception:
                _zfs_pools_cache = set()
        return _zfs_pools_cache

    def get_mount_points():
        # Currently mounted filesystems; empty set if psutil fails.
        nonlocal _mount_points_cache
        if _mount_points_cache is None:
            try:
                import psutil
                _mount_points_cache = set(p.mountpoint for p in psutil.disk_partitions(all=True))
            except Exception:
                _mount_points_cache = set()
        return _mount_points_cache

    def get_pve_services_status():
        # Map of systemd service name -> bool ("active" appears in its row).
        nonlocal _pve_services_cache
        if _pve_services_cache is None:
            _pve_services_cache = {}
            try:
                result = subprocess.run(
                    ['systemctl', 'list-units', '--type=service', '--all', '--no-legend'],
                    capture_output=True, text=True, timeout=10
                )
                if result.returncode == 0:
                    for line in result.stdout.strip().split('\n'):
                        parts = line.split()
                        if parts:
                            service_name = parts[0].replace('.service', '')
                            _pve_services_cache[service_name] = 'active' in line
            except Exception:
                pass
        return _pve_services_cache

    def extract_vmid_from_text(text):
        """Extract VM/CT ID from error message or key."""
        if not text:
            return None
        # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc.
        match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE)
        return match.group(1) if match else None

    def get_age_hours(timestamp_str):
        """Get age in hours from ISO timestamp string."""
        if not timestamp_str:
            return 0
        try:
            dt = datetime.fromisoformat(timestamp_str)
            return (now - dt).total_seconds() / 3600
        except (ValueError, TypeError):
            return 0

    for error_row in active_errors:
        err_id, error_key, category, message, first_seen, last_seen, severity = error_row
        should_resolve = False
        resolution_reason = None
        age_hours = get_age_hours(first_seen)
        last_seen_hours = get_age_hours(last_seen)

        # === VM/CT ERRORS ===
        # Check if VM/CT still exists (covers: vms category, vm_*, ct_* error keys)
        if category == 'vms' or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_'))):
            vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(message)
            if vmid and not check_vm_ct_cached(vmid):
                should_resolve = True
                resolution_reason = 'VM/CT deleted'

        # === DISK ERRORS ===
        # Check if disk device or ZFS pool still exists
        elif category == 'disks' or category == 'storage':
            if error_key:
                # Check for ZFS pool errors (e.g., "zfs_pool_rpool_degraded")
                zfs_match = re.search(r'zfs_(?:pool_)?([a-zA-Z0-9_-]+)', error_key)
                if zfs_match:
                    pool_name = zfs_match.group(1)
                    pools = get_zfs_pools()
                    if pools and pool_name not in pools:
                        should_resolve = True
                        resolution_reason = 'ZFS pool removed'

                # Check for disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing")
                if not should_resolve:
                    disk_match = re.search(r'(?:disk_|smart_|io_error_)([a-z]{2,4}\d*)', error_key)
                    if disk_match:
                        disk_name = disk_match.group(1)
                        disk_path = f'/dev/{disk_name}'
                        if not os.path.exists(disk_path):
                            should_resolve = True
                            resolution_reason = 'Disk device removed'

                # Check for mount point errors (e.g., "disk_fs_/mnt/data")
                if not should_resolve and 'disk_fs_' in error_key:
                    mount = error_key.replace('disk_fs_', '').split('_')[0]
                    if mount.startswith('/'):
                        mounts = get_mount_points()
                        if mounts and mount not in mounts:
                            should_resolve = True
                            resolution_reason = 'Mount point removed'

        # === NETWORK ERRORS ===
        # Check if network interface still exists
        elif category == 'network':
            if error_key:
                # Extract interface name (e.g., "net_vmbr1_down" -> "vmbr1", "bond0_slave_error" -> "bond0")
                iface_match = re.search(r'(?:net_|bond_|vmbr|eth|eno|ens|enp)([a-zA-Z0-9_]+)?', error_key)
                if iface_match:
                    # Reconstruct full interface name
                    full_match = re.search(r'((?:vmbr|bond|eth|eno|ens|enp)[a-zA-Z0-9]+)', error_key)
                    if full_match:
                        iface = full_match.group(1)
                        interfaces = get_network_interfaces()
                        if interfaces and iface not in interfaces:
                            should_resolve = True
                            resolution_reason = 'Network interface removed'

        # === SERVICE ERRORS ===
        # Check if service exists or if it references a deleted CT
        elif category in ('services', 'pve_services'):
            # First check if it references a CT that no longer exists
            vmid = extract_vmid_from_text(message) or extract_vmid_from_text(error_key)
            if vmid and not check_vm_ct_cached(vmid):
                should_resolve = True
                resolution_reason = 'Container deleted'

            # For pve_services, check if the service unit exists
            if not should_resolve and category == 'pve_services' and error_key:
                service_match = re.search(r'service_([a-zA-Z0-9_-]+)', error_key)
                if service_match:
                    service_name = service_match.group(1)
                    services = get_pve_services_status()
                    if services and service_name not in services:
                        should_resolve = True
                        resolution_reason = 'Service no longer exists'

        # === LOG ERRORS ===
        # Auto-resolve log errors after 48h (they represent point-in-time issues)
        elif category == 'logs' or (error_key and error_key.startswith(('log_persistent_', 'log_spike_', 'log_cascade_', 'log_critical_'))):
            if age_hours > 48:
                should_resolve = True
                resolution_reason = 'Log error aged out (>48h)'

        # === CLUSTER ERRORS ===
        # Resolve cluster/corosync/qdevice errors if node is no longer in a cluster
        elif error_key and any(x in error_key.lower() for x in ('cluster', 'corosync', 'qdevice', 'quorum')):
            cluster_info = get_cluster_status()
            if not cluster_info['is_cluster']:
                should_resolve = True
                resolution_reason = 'No longer in cluster'

        # === TEMPERATURE ERRORS ===
        # Temperature errors - check if sensor still exists (unlikely to change, resolve after 24h of no activity)
        elif category == 'temperature':
            if last_seen_hours > 24:
                should_resolve = True
                resolution_reason = 'Temperature error stale (>24h no activity)'

        # === UPDATES/SECURITY ERRORS ===
        # These are informational - auto-resolve after 7 days if acknowledged or stale
        elif category in ('updates', 'security'):
            if age_hours > 168:  # 7 days
                should_resolve = True
                resolution_reason = 'Update/security notice aged out (>7d)'

        # === FALLBACK: ANY STALE ERROR ===
        # Any error that hasn't been seen in 7 days and is older than 7 days
        if not should_resolve and age_hours > 168 and last_seen_hours > 168:
            should_resolve = True
            resolution_reason = 'Stale error (no activity >7d)'

        if should_resolve:
            cursor.execute('''
                UPDATE errors SET resolved_at = ?, resolution_type = 'auto'
                WHERE id = ?
            ''', (now_iso, err_id))
            resolved_count += 1

    if resolved_count > 0:
        conn.commit()
        print(f"[HealthPersistence] Auto-resolved {resolved_count} errors for stale/deleted resources")

    conn.close()
||||
|
||||
def _check_vm_ct_exists(self, vmid: str) -> bool:
|
||||
"""Check if a VM or CT exists (not just running, but exists at all).
|
||||
|
||||
Uses 'qm config' and 'pct config' which return success even for stopped VMs/CTs,
|
||||
but fail if the VM/CT doesn't exist.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
try:
|
||||
# Try VM first
|
||||
result = subprocess.run(
|
||||
['qm', 'config', vmid],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=3
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return True
|
||||
|
||||
# Try CT
|
||||
result = subprocess.run(
|
||||
['pct', 'config', vmid],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=3
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return True
|
||||
|
||||
return False
|
||||
except Exception:
|
||||
# On error, assume it exists to avoid false positives
|
||||
return True
|
||||
|
||||
def check_vm_running(self, vm_id: str) -> bool:
|
||||
"""
|
||||
|
||||
@@ -28,7 +28,7 @@ from pathlib import Path
|
||||
|
||||
# ─── Shared State for Cross-Watcher Coordination ──────────────────
|
||||
|
||||
# ─── Startup Grace Period ─────────────────<EFBFBD><EFBFBD><EFBFBD>──────────────────────────────────
|
||||
# ─── Startup Grace Period ────────────────────────────────────────────────────
|
||||
# Import centralized startup grace management
|
||||
# This provides a single source of truth for all grace period logic
|
||||
import startup_grace
|
||||
@@ -2610,7 +2610,7 @@ class PollingCollector:
|
||||
pass
|
||||
|
||||
|
||||
# ─── Proxmox Webhook Receiver ───────────────────────────────────
|
||||
# ─── Proxmox Webhook Receiver ─────────────<EFBFBD><EFBFBD><EFBFBD>─────────────────────
|
||||
|
||||
class ProxmoxHookWatcher:
|
||||
"""Receives native Proxmox VE notifications via local webhook endpoint.
|
||||
|
||||
@@ -44,6 +44,13 @@ from notification_events import (
|
||||
ProxmoxHookWatcher,
|
||||
)
|
||||
|
||||
# AI context enrichment (uptime, frequency, SMART data, known errors)
|
||||
try:
|
||||
from ai_context_enrichment import enrich_context_for_ai
|
||||
except ImportError:
|
||||
def enrich_context_for_ai(title, body, event_type, data, journal_context='', detail_level='standard'):
|
||||
return journal_context
|
||||
|
||||
|
||||
# ─── Constants ────────────────────────────────────────────────────
|
||||
|
||||
@@ -743,10 +750,10 @@ class NotificationManager:
|
||||
'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
|
||||
}
|
||||
|
||||
# Get journal context if available
|
||||
journal_context = data.get('_journal_context', '')
|
||||
|
||||
for ch_name, channel in channels.items():
|
||||
# Get journal context if available (will be enriched per-channel based on detail_level)
|
||||
raw_journal_context = data.get('_journal_context', '')
|
||||
|
||||
for ch_name, channel in channels.items():
|
||||
# ── Per-channel category check ──
|
||||
# Default: category enabled (true) unless explicitly disabled.
|
||||
ch_group_key = f'{ch_name}.events.{event_group}'
|
||||
@@ -771,17 +778,28 @@ class NotificationManager:
|
||||
rich_key = f'{ch_name}.rich_format'
|
||||
use_rich_format = self._config.get(rich_key, 'false') == 'true'
|
||||
|
||||
# ── Per-channel AI enhancement ──
|
||||
# Apply AI with channel-specific detail level and emoji setting
|
||||
# If AI is enabled AND rich_format is on, AI will include emojis directly
|
||||
# Pass channel_type so AI knows whether to append original (email only)
|
||||
channel_ai_config = {**ai_config, 'channel_type': ch_name}
|
||||
ai_result = format_with_ai_full(
|
||||
ch_title, ch_body, severity, channel_ai_config,
|
||||
detail_level=detail_level,
|
||||
journal_context=journal_context,
|
||||
use_emojis=use_rich_format
|
||||
)
|
||||
# ── Per-channel AI enhancement ──
|
||||
# Apply AI with channel-specific detail level and emoji setting
|
||||
# If AI is enabled AND rich_format is on, AI will include emojis directly
|
||||
# Pass channel_type so AI knows whether to append original (email only)
|
||||
channel_ai_config = {**ai_config, 'channel_type': ch_name}
|
||||
|
||||
# Enrich context with uptime, frequency, SMART data, and known errors
|
||||
enriched_context = enrich_context_for_ai(
|
||||
title=ch_title,
|
||||
body=ch_body,
|
||||
event_type=event_type,
|
||||
data=data,
|
||||
journal_context=raw_journal_context,
|
||||
detail_level=detail_level
|
||||
)
|
||||
|
||||
ai_result = format_with_ai_full(
|
||||
ch_title, ch_body, severity, channel_ai_config,
|
||||
detail_level=detail_level,
|
||||
journal_context=enriched_context,
|
||||
use_emojis=use_rich_format
|
||||
)
|
||||
ch_title = ai_result.get('title', ch_title)
|
||||
ch_body = ai_result.get('body', ch_body)
|
||||
|
||||
|
||||
@@ -1384,7 +1384,13 @@ AI_DETAIL_TOKENS = {
|
||||
|
||||
# System prompt template - optimized hybrid version
|
||||
AI_SYSTEM_PROMPT = """You are a notification FORMATTER for ProxMenux Monitor (Proxmox VE).
|
||||
Your job: translate and reformat alerts into {language}. You are NOT an analyst — do not interpret or diagnose.
|
||||
Your job: translate alerts into {language} and enrich them with context when provided.
|
||||
|
||||
═══ ABSOLUTE CONSTRAINTS (NO EXCEPTIONS) ═══
|
||||
- NO HALLUCINATIONS: Do not invent causes, solutions, or facts not present in the provided data
|
||||
- NO SPECULATION: If something is unclear, state what IS known, not what MIGHT be
|
||||
- NO CONVERSATIONAL TEXT: Never write "Here is...", "I've translated...", "Let me explain..."
|
||||
- ONLY use information from: the message, journal context, and known error database (if provided)
|
||||
|
||||
═══ WHAT TO TRANSLATE ═══
|
||||
Translate: labels, descriptions, status words, units (GB→Go in French, etc.)
|
||||
@@ -1394,15 +1400,37 @@ DO NOT translate: hostnames, IPs, paths, VM/CT IDs, device names (/dev/sdX), tec
|
||||
1. Plain text only — NO markdown, no **bold**, no `code`, no bullet lists (use "• " for packages only)
|
||||
2. Preserve severity: "failed" stays "failed", "warning" stays "warning" — never soften errors
|
||||
3. Preserve structure: keep same fields and line order, only translate content
|
||||
4. Detail level "{detail_level}": brief (2-3 lines) | standard (short paragraph) | detailed (full report)
|
||||
4. Detail level "{detail_level}":
|
||||
- brief: 1-2 lines, essential facts only
|
||||
- standard: short paragraph, key details and context
|
||||
- detailed: full report with all available information, step-by-step if applicable
|
||||
5. DEDUPLICATION: merge duplicate facts from multiple sources into one clear statement
|
||||
6. EMPTY LISTS: write translated "none" after label, never leave blank
|
||||
7. Keep "hostname:" prefix in title — translate only the descriptive part
|
||||
8. DO NOT add recommendations or suggestions ("you should...", "try...", "consider...")
|
||||
{suggestions_addon}9. Present facts from message AND journal context — describe what happened, do NOT speculate
|
||||
10. OUTPUT ONLY the final result — no "Original:", no before/after comparisons
|
||||
11. Unknown input: preserve as closely as possible, translate what you can
|
||||
8. DO NOT add recommendations or suggestions UNLESS AI Suggestions mode is enabled below
|
||||
9. ENRICHED CONTEXT: You may receive additional context data including:
|
||||
- "System uptime: X days (stable system)" → helps distinguish startup issues from runtime failures
|
||||
- "Event frequency: N occurrences, first seen X ago" → indicates recurring vs one-time issues
|
||||
- "SMART Health: PASSED/FAILED" with disk attributes → critical for disk errors
|
||||
- "KNOWN PROXMOX ERROR DETECTED" with cause/solution → YOU MUST USE this exact information
|
||||
|
||||
How to use enriched context:
|
||||
- If uptime is <10min and error is service-related → mention "occurred shortly after boot"
|
||||
- If frequency shows recurring pattern → mention "recurring issue (N times in X hours)"
|
||||
- If SMART shows FAILED → treat as CRITICAL: "Disk failing - immediate attention required"
|
||||
- If KNOWN ERROR is provided → YOU MUST incorporate its Cause and Solution (translate, don't copy verbatim)
|
||||
|
||||
10. JOURNAL CONTEXT EXTRACTION: When journal logs are provided:
|
||||
- Extract specific IDs (VM/CT numbers, disk devices, service names)
|
||||
- Include relevant timestamps if they help explain the timeline
|
||||
- Identify root cause when logs clearly show it (e.g., "exit-code 255" -> "process crashed")
|
||||
- Translate technical terms: "Emask 0x10" -> "ATA bus error", "DRDY ERR" -> "drive not ready"
|
||||
- If logs show the same error repeating, state frequency: "occurred 15 times in 10 minutes"
|
||||
- IGNORE journal entries unrelated to the main event
|
||||
11. OUTPUT ONLY the final result — no "Original:", no before/after comparisons
|
||||
12. Unknown input: preserve as closely as possible, translate what you can
|
||||
13. REDUNDANCY: Never repeat the same information twice. If title says "CT 103 failed", body should not start with "Container 103 failed"
|
||||
{suggestions_addon}
|
||||
═══ PROXMOX MAPPINGS (use directly, never explain) ═══
|
||||
pve-container@XXXX → "CT XXXX" | qemu-server@XXXX → "VM XXXX" | vzdump → "backup"
|
||||
pveproxy/pvedaemon/pvestatd → "Proxmox service" | corosync → "cluster service"
|
||||
@@ -1457,18 +1485,17 @@ CORRECT (markers are separators only):
|
||||
|
||||
# Addon for experimental suggestions mode
|
||||
AI_SUGGESTIONS_ADDON = """
|
||||
EXCEPTION TO RULE 8 (Suggestions enabled): When journal context shows a clear, actionable problem,
|
||||
you MAY add ONE brief suggestion at the END of the body (after all facts), using this format:
|
||||
|
||||
💡 Tip: [your suggestion here]
|
||||
|
||||
Guidelines for suggestions:
|
||||
- Only suggest when the problem AND solution are clear from the logs
|
||||
- Keep it to ONE line, max 100 characters
|
||||
- Be specific: "Check disk /dev/sdb SMART status" not "Check your disks"
|
||||
- Use commands when helpful: "Run 'systemctl restart pvedaemon'"
|
||||
- Never speculate - only suggest based on evidence in the logs
|
||||
- Skip the tip entirely if the problem is unclear or already resolved
|
||||
═══ AI SUGGESTIONS MODE (ENABLED) ═══
|
||||
You MAY add ONE brief, actionable tip at the END of the body using this exact format:
|
||||
|
||||
💡 Tip: [your concise suggestion here]
|
||||
|
||||
Rules for the tip:
|
||||
- ONLY include if the log context or Known Error database clearly points to a specific fix
|
||||
- Keep under 100 characters
|
||||
- Be specific: "Run 'pvecm status' to check quorum" NOT "Check cluster status"
|
||||
- If Known Error provides a solution, YOU MUST USE IT (don't invent your own)
|
||||
- Never guess — skip the tip if the cause/solution is unclear
|
||||
"""
|
||||
|
||||
# Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
|
||||
|
||||
348
AppImage/scripts/proxmox_known_errors.py
Normal file
348
AppImage/scripts/proxmox_known_errors.py
Normal file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Database of known Proxmox/Linux errors with causes, solutions, and severity levels.
|
||||
|
||||
This provides the AI with accurate, pre-verified information about common errors,
|
||||
reducing hallucinations and ensuring consistent, helpful responses.
|
||||
|
||||
Each entry includes:
|
||||
- pattern: regex pattern to match against error messages/logs
|
||||
- cause: brief explanation of what causes this error
|
||||
- cause_detailed: more comprehensive explanation for detailed mode
|
||||
- severity: info, warning, critical
|
||||
- solution: brief actionable solution
|
||||
- solution_detailed: step-by-step solution for detailed mode
|
||||
- url: optional documentation link
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
# Known error patterns with causes and solutions
|
||||
PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
|
||||
# ==================== SUBSCRIPTION/LICENSE ====================
|
||||
{
|
||||
"pattern": r"no valid subscription|subscription.*invalid|not subscribed",
|
||||
"cause": "Proxmox enterprise repository requires paid subscription",
|
||||
"cause_detailed": "Proxmox VE uses a subscription model for enterprise features. Without a valid subscription key, access to the enterprise repository is denied. This is normal for home/lab users.",
|
||||
"severity": "info",
|
||||
"solution": "Use no-subscription repository or purchase subscription",
|
||||
"solution_detailed": "For home/lab use: Switch to the no-subscription repository by editing /etc/apt/sources.list.d/pve-enterprise.list. For production: Purchase a subscription at proxmox.com/pricing",
|
||||
"url": "https://pve.proxmox.com/wiki/Package_Repositories",
|
||||
"category": "updates"
|
||||
},
|
||||
|
||||
# ==================== CLUSTER/COROSYNC ====================
|
||||
{
|
||||
"pattern": r"quorum.*lost|lost.*quorum|not.*quorate",
|
||||
"cause": "Cluster lost majority of voting nodes",
|
||||
"cause_detailed": "Corosync cluster requires more than 50% of configured votes to maintain quorum. When quorum is lost, the cluster becomes read-only to prevent split-brain scenarios.",
|
||||
"severity": "critical",
|
||||
"solution": "Check network connectivity between nodes; ensure majority of nodes are online",
|
||||
"solution_detailed": "1. Verify network connectivity: ping all cluster nodes\n2. Check corosync status: systemctl status corosync\n3. View cluster status: pvecm status\n4. If nodes are unreachable, check firewall rules (ports 5405-5412 UDP)\n5. For emergency single-node operation: pvecm expected 1",
|
||||
"url": "https://pve.proxmox.com/wiki/Cluster_Manager",
|
||||
"category": "cluster"
|
||||
},
|
||||
{
|
||||
"pattern": r"corosync.*qdevice.*error|qdevice.*connection.*failed|qdevice.*not.*connected",
|
||||
"cause": "QDevice helper node is unreachable",
|
||||
"cause_detailed": "The Corosync QDevice provides an additional vote for 2-node clusters. When it cannot connect, the cluster may lose quorum if one node fails.",
|
||||
"severity": "warning",
|
||||
"solution": "Check QDevice server connectivity and corosync-qnetd service",
|
||||
"solution_detailed": "1. Verify QDevice server is running: systemctl status corosync-qnetd (on QDevice host)\n2. Check connectivity: nc -zv <qdevice-ip> 5403\n3. Restart qdevice: systemctl restart corosync-qdevice\n4. Check certificates: corosync-qdevice-net-certutil -s",
|
||||
"url": "https://pve.proxmox.com/wiki/Cluster_Manager#_corosync_external_vote_support",
|
||||
"category": "cluster"
|
||||
},
|
||||
{
|
||||
"pattern": r"corosync.*retransmit|corosync.*token.*timeout|ring.*mark.*faulty",
|
||||
"cause": "Network latency or packet loss between cluster nodes",
|
||||
"cause_detailed": "Corosync uses multicast/unicast for cluster communication. High latency, packet loss, or network congestion causes token timeouts and retransmissions, potentially leading to node eviction.",
|
||||
"severity": "warning",
|
||||
"solution": "Check network quality between nodes; consider increasing token timeout",
|
||||
"solution_detailed": "1. Test network latency: ping -c 100 <other-node>\n2. Check for packet loss between nodes\n3. Verify MTU settings match on all interfaces\n4. Increase token timeout in /etc/pve/corosync.conf if needed (default 1000ms)\n5. Check switch/router for congestion",
|
||||
"category": "cluster"
|
||||
},
|
||||
|
||||
# ==================== DISK/STORAGE ====================
|
||||
{
|
||||
"pattern": r"SMART.*FAILED|smart.*failed.*health|Pre-fail|Old_age.*FAILING",
|
||||
"cause": "Disk SMART health check failed - disk is failing",
|
||||
"cause_detailed": "SMART (Self-Monitoring, Analysis and Reporting Technology) detected critical disk health issues. The disk is likely failing and data loss is imminent.",
|
||||
"severity": "critical",
|
||||
"solution": "IMMEDIATELY backup data and replace disk",
|
||||
"solution_detailed": "1. URGENT: Backup all data from this disk immediately\n2. Check SMART details: smartctl -a /dev/sdX\n3. Note the failing attributes (Reallocated_Sector_Ct, Current_Pending_Sector, etc.)\n4. Plan disk replacement\n5. If in RAID/ZFS: initiate disk replacement procedure",
|
||||
"category": "disks"
|
||||
},
|
||||
{
|
||||
"pattern": r"Reallocated_Sector_Ct.*threshold|reallocated.*sectors?.*exceeded",
|
||||
"cause": "Disk has excessive bad sectors being remapped",
|
||||
"cause_detailed": "The disk firmware has remapped multiple bad sectors to spare areas. While the disk is still functioning, this indicates physical degradation and eventual failure.",
|
||||
"severity": "warning",
|
||||
"solution": "Monitor closely and plan disk replacement",
|
||||
"solution_detailed": "1. Check current value: smartctl -A /dev/sdX | grep Reallocated\n2. If value is increasing, plan immediate replacement\n3. Backup important data\n4. Run extended SMART test: smartctl -t long /dev/sdX",
|
||||
"category": "disks"
|
||||
},
|
||||
{
|
||||
"pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error",
|
||||
"cause": "ATA communication error with disk",
|
||||
"cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
|
||||
"severity": "warning",
|
||||
"solution": "Check SATA cables and connections; verify disk health with smartctl",
|
||||
"solution_detailed": "1. Check SMART health: smartctl -H /dev/sdX\n2. Inspect and reseat SATA cables\n3. Try different SATA port\n4. Check dmesg for pattern of errors\n5. If errors persist, disk may be failing",
|
||||
"category": "disks"
|
||||
},
|
||||
{
|
||||
"pattern": r"I/O.*error|blk_update_request.*error|Buffer I/O error",
|
||||
"cause": "Disk I/O operation failed",
|
||||
"cause_detailed": "The kernel failed to read or write data to the disk. This can be caused by disk failure, cable issues, or filesystem corruption.",
|
||||
"severity": "critical",
|
||||
"solution": "Check disk health and connections immediately",
|
||||
"solution_detailed": "1. Check SMART status: smartctl -H /dev/sdX\n2. Check dmesg for related errors: dmesg | grep -i error\n3. Verify disk is still accessible: lsblk\n4. If ZFS: check pool status with zpool status\n5. Consider filesystem check if safe to unmount",
|
||||
"category": "disks"
|
||||
},
|
||||
{
|
||||
"pattern": r"zfs.*pool.*DEGRADED|pool.*is.*degraded",
|
||||
"cause": "ZFS pool has reduced redundancy",
|
||||
"cause_detailed": "One or more devices in the ZFS pool are unavailable or experiencing errors. The pool is still functional but without full redundancy.",
|
||||
"severity": "warning",
|
||||
"solution": "Identify failed device with 'zpool status' and replace",
|
||||
"solution_detailed": "1. Check pool status: zpool status <pool>\n2. Identify the DEGRADED or UNAVAIL device\n3. If device is present but erroring: zpool scrub <pool>\n4. To replace: zpool replace <pool> <old-device> <new-device>\n5. Monitor resilver progress: zpool status",
|
||||
"category": "storage"
|
||||
},
|
||||
{
|
||||
"pattern": r"zfs.*pool.*FAULTED|pool.*is.*faulted",
|
||||
"cause": "ZFS pool is inaccessible",
|
||||
"cause_detailed": "The ZFS pool has lost too many devices and cannot maintain data integrity. Data may be inaccessible.",
|
||||
"severity": "critical",
|
||||
"solution": "Check failed devices; may need data recovery",
|
||||
"solution_detailed": "1. Check status: zpool status <pool>\n2. Identify all failed devices\n3. Attempt to online devices: zpool online <pool> <device>\n4. If drives are physically present, try zpool clear <pool>\n5. May require data recovery if multiple drives failed",
|
||||
"category": "storage"
|
||||
},
|
||||
|
||||
# ==================== CEPH ====================
|
||||
{
|
||||
"pattern": r"ceph.*OSD.*down|osd\.\d+.*down|ceph.*osd.*failed",
|
||||
"cause": "Ceph OSD daemon is not running",
|
||||
"cause_detailed": "A Ceph Object Storage Daemon (OSD) has stopped or crashed. This reduces storage redundancy and may trigger data rebalancing.",
|
||||
"severity": "warning",
|
||||
"solution": "Check disk health and restart OSD service",
|
||||
"solution_detailed": "1. Check OSD status: ceph osd tree\n2. View OSD logs: journalctl -u ceph-osd@<id>\n3. Check underlying disk: smartctl -H /dev/sdX\n4. Restart OSD: systemctl start ceph-osd@<id>\n5. If OSD keeps crashing, check for disk failure",
|
||||
"category": "storage"
|
||||
},
|
||||
{
|
||||
"pattern": r"ceph.*health.*WARN|HEALTH_WARN",
|
||||
"cause": "Ceph cluster has warnings",
|
||||
"cause_detailed": "Ceph detected issues that don't prevent operation but should be addressed. Common causes: degraded PGs, clock skew, full OSDs.",
|
||||
"severity": "warning",
|
||||
"solution": "Run 'ceph health detail' for specific issues",
|
||||
"solution_detailed": "1. Get details: ceph health detail\n2. Common fixes:\n - Degraded PGs: wait for recovery or add capacity\n - Clock skew: sync NTP on all nodes\n - Full OSDs: add storage or delete data\n3. Check: ceph status",
|
||||
"category": "storage"
|
||||
},
|
||||
{
|
||||
"pattern": r"ceph.*health.*ERR|HEALTH_ERR",
|
||||
"cause": "Ceph cluster has critical errors",
|
||||
"cause_detailed": "Ceph has detected critical issues that may affect data availability or integrity. Immediate attention required.",
|
||||
"severity": "critical",
|
||||
"solution": "Run 'ceph health detail' and address errors immediately",
|
||||
"solution_detailed": "1. Get details: ceph health detail\n2. Check OSD status: ceph osd tree\n3. Check MON status: ceph mon stat\n4. View PG status: ceph pg stat\n5. Address each error shown in health detail",
|
||||
"category": "storage"
|
||||
},
|
||||
|
||||
# ==================== VM/CT ERRORS ====================
|
||||
{
|
||||
"pattern": r"TASK ERROR.*failed to get exclusive lock|lock.*timeout|couldn't acquire lock",
|
||||
"cause": "Resource is locked by another operation",
|
||||
"cause_detailed": "Another task is currently holding a lock on this VM/CT. This prevents concurrent modifications that could cause corruption.",
|
||||
"severity": "info",
|
||||
"solution": "Wait for other task to complete or check for stuck tasks",
|
||||
"solution_detailed": "1. Check running tasks: cat /var/log/pve/tasks/active\n2. Wait for task completion\n3. If task is stuck (>1h), check process: ps aux | grep <vmid>\n4. As last resort, remove lock file: rm /var/lock/qemu-server/lock-<vmid>.conf",
|
||||
"category": "vms"
|
||||
},
|
||||
{
|
||||
"pattern": r"kvm.*not.*available|kvm.*disabled|hardware.*virtualization.*disabled",
|
||||
"cause": "KVM/hardware virtualization not available",
|
||||
"cause_detailed": "The CPU's hardware virtualization extensions (Intel VT-x or AMD-V) are either not supported, not enabled in BIOS, or blocked by another hypervisor.",
|
||||
"severity": "warning",
|
||||
"solution": "Enable VT-x/AMD-V in BIOS settings",
|
||||
"solution_detailed": "1. Reboot into BIOS/UEFI\n2. Find Virtualization settings (often in CPU or Advanced section)\n3. Enable Intel VT-x or AMD-V/SVM\n4. Save and reboot\n5. Verify: grep -E 'vmx|svm' /proc/cpuinfo",
|
||||
"category": "vms"
|
||||
},
|
||||
{
|
||||
"pattern": r"out of memory|OOM.*kill|cannot allocate memory|memory.*exhausted",
|
||||
"cause": "System or VM ran out of memory",
|
||||
"cause_detailed": "The Linux OOM (Out Of Memory) killer terminated a process to free memory. This indicates memory pressure from overcommitment or memory leaks.",
|
||||
"severity": "critical",
|
||||
"solution": "Increase memory allocation or reduce VM memory usage",
|
||||
"solution_detailed": "1. Check what was killed: dmesg | grep -i oom\n2. Review memory usage: free -h\n3. Check balloon driver status for VMs\n4. Consider adding swap or RAM\n5. Review VM memory allocations for overcommitment",
|
||||
"category": "memory"
|
||||
},
|
||||
|
||||
# ==================== NETWORK ====================
|
||||
{
|
||||
"pattern": r"bond.*slave.*link.*down|bond.*no.*active.*slave",
|
||||
"cause": "Network bond lost a slave interface",
|
||||
"cause_detailed": "One or more physical interfaces in a network bond have lost link. Depending on bond mode, this may reduce bandwidth or affect failover.",
|
||||
"severity": "warning",
|
||||
"solution": "Check physical cable connections and switch ports",
|
||||
"solution_detailed": "1. Check bond status: cat /proc/net/bonding/bond0\n2. Identify down slave interface\n3. Check physical cable connection\n4. Check switch port status and errors\n5. Verify interface: ethtool <slave-iface>",
|
||||
"category": "network"
|
||||
},
|
||||
{
|
||||
"pattern": r"link.*not.*ready|carrier.*lost|link.*down|NIC.*Link.*Down",
|
||||
"cause": "Network interface lost link",
|
||||
"cause_detailed": "The physical or virtual network interface has lost its connection. This could be a cable issue, switch problem, or driver issue.",
|
||||
"severity": "warning",
|
||||
"solution": "Check cable, switch port, and interface status",
|
||||
"solution_detailed": "1. Check interface: ip link show <iface>\n2. Check cable connection\n3. Check switch port LEDs\n4. Try: ip link set <iface> down && ip link set <iface> up\n5. Check driver: ethtool -i <iface>",
|
||||
"category": "network"
|
||||
},
|
||||
{
|
||||
"pattern": r"bridge.*STP.*blocked|spanning.*tree.*blocked",
|
||||
"cause": "Spanning Tree Protocol blocked a port",
|
||||
"cause_detailed": "STP detected a potential network loop and blocked a bridge port to prevent broadcast storms. This is normal behavior but may indicate network topology issues.",
|
||||
"severity": "info",
|
||||
"solution": "Review network topology; this may be expected behavior",
|
||||
"solution_detailed": "1. Check bridge status: brctl show\n2. View STP state: brctl showstp <bridge>\n3. If unexpected, review network topology for loops\n4. Consider disabling STP if network is simple: brctl stp <bridge> off",
|
||||
"category": "network"
|
||||
},
|
||||
|
||||
# ==================== SERVICES ====================
|
||||
{
|
||||
"pattern": r"pvedaemon.*failed|pveproxy.*failed|pvestatd.*failed",
|
||||
"cause": "Critical Proxmox service failed",
|
||||
"cause_detailed": "One of the core Proxmox daemons has crashed or failed to start. This may affect web GUI access or API functionality.",
|
||||
"severity": "critical",
|
||||
"solution": "Restart the failed service; check logs for cause",
|
||||
"solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -u <service> -n 50\n3. Restart: systemctl restart <service>\n4. If persistent, check: /var/log/pveproxy/access.log",
|
||||
"category": "pve_services"
|
||||
},
|
||||
{
|
||||
"pattern": r"failed to start.*service|service.*start.*failed|service.*activation.*failed",
|
||||
"cause": "System service failed to start",
|
||||
"cause_detailed": "A systemd service unit failed during startup. This could be due to configuration errors, missing dependencies, or resource issues.",
|
||||
"severity": "warning",
|
||||
"solution": "Check service logs with journalctl -u <service>",
|
||||
"solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -xeu <service>\n3. Check config: systemctl cat <service>\n4. Verify dependencies: systemctl list-dependencies <service>\n5. Try restart: systemctl restart <service>",
|
||||
"category": "services"
|
||||
},
|
||||
|
||||
# ==================== BACKUP ====================
|
||||
{
|
||||
"pattern": r"backup.*failed|vzdump.*error|backup.*job.*failed",
|
||||
"cause": "Backup job failed",
|
||||
"cause_detailed": "A scheduled or manual backup operation failed. Common causes: storage full, VM locked, network issues for remote storage.",
|
||||
"severity": "warning",
|
||||
"solution": "Check backup storage space and VM status",
|
||||
"solution_detailed": "1. Check backup log in Datacenter > Backup\n2. Verify storage space: df -h\n3. Check if VM is locked: qm list or pct list\n4. Verify backup storage is accessible\n5. Try manual backup to identify specific error",
|
||||
"category": "backups"
|
||||
},
|
||||
|
||||
# ==================== CERTIFICATES ====================
|
||||
{
|
||||
"pattern": r"certificate.*expired|SSL.*certificate.*expired|cert.*expir",
|
||||
"cause": "SSL/TLS certificate has expired",
|
||||
"cause_detailed": "An SSL certificate used for secure communication has passed its expiration date. This may cause connection failures or security warnings.",
|
||||
"severity": "warning",
|
||||
"solution": "Renew the certificate using pvenode cert set or Let's Encrypt",
|
||||
"solution_detailed": "1. Check certificate: pvenode cert info\n2. For self-signed renewal: pvecm updatecerts\n3. For Let's Encrypt: pvenode acme cert order\n4. Restart pveproxy after renewal: systemctl restart pveproxy",
|
||||
"url": "https://pve.proxmox.com/wiki/Certificate_Management",
|
||||
"category": "security"
|
||||
},
|
||||
|
||||
# ==================== HARDWARE/TEMPERATURE ====================
|
||||
{
|
||||
"pattern": r"temperature.*critical|thermal.*critical|CPU.*overheating|temp.*above.*threshold",
|
||||
"cause": "Component temperature critical",
|
||||
"cause_detailed": "A hardware component (CPU, disk, etc.) has reached a dangerous temperature. Sustained high temperatures can cause hardware damage or system shutdowns.",
|
||||
"severity": "critical",
|
||||
"solution": "Check cooling system immediately; clean dust, verify fans",
|
||||
"solution_detailed": "1. Check current temps: sensors\n2. Verify all fans are running\n3. Clean dust from heatsinks and filters\n4. Ensure adequate airflow\n5. Consider reapplying thermal paste if CPU\n6. Check ambient room temperature",
|
||||
"category": "temperature"
|
||||
},
|
||||
|
||||
# ==================== AUTHENTICATION ====================
|
||||
{
|
||||
"pattern": r"authentication.*failed|login.*failed|invalid.*credentials|access.*denied",
|
||||
"cause": "Authentication failure",
|
||||
"cause_detailed": "A login attempt failed due to invalid credentials or permissions. Multiple failures may indicate a brute-force attack.",
|
||||
"severity": "info",
|
||||
"solution": "Verify credentials; check for unauthorized access attempts",
|
||||
"solution_detailed": "1. Review auth logs: journalctl -u pvedaemon | grep auth\n2. Check for multiple failures from same IP\n3. Verify user exists: pveum user list\n4. If attack suspected, consider fail2ban\n5. Reset password if needed: pveum passwd <user>",
|
||||
"category": "security"
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def find_matching_error(text: str, category: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Find the first known error whose pattern matches the given text.

    Entries are checked in the order they appear in PROXMOX_KNOWN_ERRORS,
    so more specific patterns should be listed before generic ones.

    Args:
        text: Error message or log content to match against.
        category: Optional category to filter by (e.g. "disks", "cluster").

    Returns:
        The matching error dict, or None if nothing matched.
    """
    if not text:
        return None

    for error in PROXMOX_KNOWN_ERRORS:
        # Filter by category if specified
        if category and error.get("category") != category:
            continue

        try:
            # re.IGNORECASE already makes matching case-insensitive, so the
            # previous text.lower() pass was redundant; match the raw text.
            if re.search(error["pattern"], text, re.IGNORECASE):
                return error
        except re.error:
            # A malformed pattern should skip that entry, not break lookup
            # for every notification that passes through here.
            continue

    return None
|
||||
|
||||
|
||||
def get_error_context(text: str, category: Optional[str] = None, detail_level: str = "standard") -> Optional[str]:
    """Get formatted context for a known error.

    Args:
        text: Error message to match.
        category: Optional category filter.
        detail_level: "minimal", "standard", or "detailed".

    Returns:
        Formatted context string, or None when no known error matches.
    """
    entry = find_matching_error(text, category)
    if entry is None:
        return None

    # Minimal mode is a single line; no further formatting needed.
    if detail_level == "minimal":
        return f"Known issue: {entry['cause']}"

    # "standard" uses the short texts and "Docs:"; anything else is treated
    # as detailed, falling back to the short texts when no *_detailed exists.
    detailed = detail_level != "standard"
    if detailed:
        cause_text = entry.get('cause_detailed', entry['cause'])
        solution_text = entry.get('solution_detailed', entry['solution'])
        doc_label = " Documentation: "
    else:
        cause_text = entry['cause']
        solution_text = entry['solution']
        doc_label = " Docs: "

    parts = [
        "KNOWN PROXMOX ERROR DETECTED:",
        f" Cause: {cause_text}",
        f" Severity: {entry['severity'].upper()}",
        f" Solution: {solution_text}",
    ]
    if entry.get("url"):
        parts.append(doc_label + entry['url'])
    return "\n".join(parts)
|
||||
|
||||
|
||||
def get_all_patterns() -> List[str]:
    """Return the regex pattern of every known error, in database order."""
    return [entry["pattern"] for entry in PROXMOX_KNOWN_ERRORS]
|
||||
Reference in New Issue
Block a user