update ai_context_enrichment.py

This commit is contained in:
MacRimi
2026-04-02 16:59:09 +02:00
parent 873c77d659
commit e11daa0b36
3 changed files with 89 additions and 56 deletions

View File

@@ -304,26 +304,22 @@ def enrich_context_for_ai(
context_parts = []
combined_text = f"{title} {body} {journal_context}"
# 1. System uptime - only relevant for failure/error events, not informational
# 1. System uptime - ONLY for critical system-level failures
# Uptime helps distinguish startup issues from runtime failures
# Only include uptime when something FAILED or has CRITICAL/WARNING status
uptime_relevant_types = [
'fail', 'error', 'critical', 'crash', 'panic', 'oom',
'disk_error', 'smart_error', 'io_error', 'service_fail',
'split_brain', 'quorum_lost', 'node_offline'
]
# Exclude informational events (success, start, stop, complete, etc.)
informational_types = [
'update', 'upgrade', 'available', 'info', 'resolved',
'start', 'stop', 'shutdown', 'restart', 'complete',
'backup_complete', 'backup_start', 'migration'
# BUT it's noise for disk errors, warnings, or routine operations
# Only include for: system crash, kernel panic, OOM, cluster failures
uptime_critical_types = [
'crash', 'panic', 'oom', 'kernel',
'split_brain', 'quorum_lost', 'node_offline', 'node_fail',
'system_fail', 'boot_fail'
]
is_uptime_relevant = any(t in event_type.lower() for t in uptime_relevant_types)
is_informational = any(t in event_type.lower() for t in informational_types)
# Check if this is a critical system-level event (not disk/service/hardware)
event_lower = event_type.lower()
is_critical_system_event = any(t in event_lower for t in uptime_critical_types)
# Only add uptime for actual failures, not routine operations
if is_uptime_relevant and not is_informational:
# Only add uptime for critical system failures, nothing else
if is_critical_system_event:
uptime = get_system_uptime()
if uptime and uptime != "unknown":
context_parts.append(f"System uptime: {uptime}")

View File

@@ -6192,6 +6192,8 @@ def api_network_interface_metrics(interface_name):
rrd_data = []
rrd_error = None
if interface_type == 'vm_lxc':
# For VM/LXC interfaces, get data from the VM/LXC RRD
vmid, vm_type = extract_vmid_from_interface(interface_name)
@@ -6202,19 +6204,20 @@ def api_network_interface_metrics(interface_name):
capture_output=True, text=True, timeout=10)
if rrd_result.returncode == 0:
all_data = json.loads(rrd_result.stdout)
# Filter to only network-related fields
for point in all_data:
filtered_point = {'time': point.get('time')}
# Add network fields if they exist
for key in ['netin', 'netout']:
if key in point:
filtered_point[key] = point[key]
rrd_data.append(filtered_point)
try:
all_data = json.loads(rrd_result.stdout)
# Filter to only network-related fields
for point in all_data:
filtered_point = {'time': point.get('time')}
# Add network fields if they exist
for key in ['netin', 'netout']:
if key in point:
filtered_point[key] = point[key]
rrd_data.append(filtered_point)
except json.JSONDecodeError:
rrd_error = f'RRD data for {vm_type.upper()} {vmid} is empty or corrupted'
else:
# print(f"[v0] ERROR: Failed to get RRD data for VM/LXC")
pass
rrd_error = f'Failed to get RRD data: {rrd_result.stderr}'
else:
# For physical/bridge interfaces, get data from node RRD
@@ -6223,38 +6226,42 @@ def api_network_interface_metrics(interface_name):
capture_output=True, text=True, timeout=10)
if rrd_result.returncode == 0:
all_data = json.loads(rrd_result.stdout)
# Filter to only network-related fields for this interface
for point in all_data:
filtered_point = {'time': point.get('time')}
# Add network fields if they exist
for key in ['netin', 'netout']:
if key in point:
filtered_point[key] = point[key]
rrd_data.append(filtered_point)
try:
all_data = json.loads(rrd_result.stdout)
# Filter to only network-related fields for this interface
for point in all_data:
filtered_point = {'time': point.get('time')}
# Add network fields if they exist
for key in ['netin', 'netout']:
if key in point:
filtered_point[key] = point[key]
rrd_data.append(filtered_point)
except json.JSONDecodeError:
rrd_error = 'Node RRD data is empty or corrupted'
else:
# print(f"[v0] ERROR: Failed to get RRD data for node")
pass
rrd_error = f'Failed to get RRD data: {rrd_result.stderr}'
# If there was an RRD error and no data collected, return error with details
if rrd_error and not rrd_data:
return jsonify({
'error': 'RRD data not available',
'details': rrd_error,
'suggestion': 'The RRD database may be empty or corrupted. Try: systemctl restart rrdcached'
}), 503
return jsonify({
'interface': interface_name,
'type': interface_type,
'timeframe': timeframe,
'data': rrd_data
'data': rrd_data,
'warning': rrd_error if rrd_error else None # Include warning if there was an error but some data exists
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/api/vms', methods=['GET'])
@require_auth
def api_vms():
    """Serve GET /api/vms.

    Returns whatever get_proxmox_vms() produces, wrapped in a JSON
    response via jsonify(). Access is gated by the require_auth decorator.
    """
    vm_info = get_proxmox_vms()
    return jsonify(vm_info)
@app.route('/api/vms/<int:vmid>/metrics', methods=['GET'])
@require_auth
def api_vm_metrics(vmid):
@@ -6316,9 +6323,22 @@ def api_vm_metrics(vmid):
'data': rrd_data
})
else:
# Check if RRD file is empty or corrupted
stderr_lower = rrd_result.stderr.lower() if rrd_result.stderr else ''
if 'rrd' in stderr_lower or 'no such file' in stderr_lower or 'empty' in stderr_lower:
return jsonify({
'error': 'RRD data not available',
'details': f'The RRD database for {vm_type.upper()} {vmid} may be empty or corrupted.',
'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached'
}), 503
return jsonify({'error': f'Failed to get RRD data: {rrd_result.stderr}'}), 500
except json.JSONDecodeError:
return jsonify({
'error': 'RRD data not available',
'details': f'Unable to parse metrics data for VM/LXC {vmid}.',
'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached'
}), 503
except Exception as e:
return jsonify({'error': str(e)}), 500
@@ -6381,8 +6401,23 @@ def api_node_metrics():
'data': rrd_data
})
else:
# Check if RRD file is empty or corrupted
stderr_lower = rrd_result.stderr.lower() if rrd_result.stderr else ''
if 'rrd' in stderr_lower or 'no such file' in stderr_lower or 'empty' in stderr_lower:
return jsonify({
'error': 'RRD data not available',
'details': 'The RRD database file may be empty or corrupted. This can happen if rrdcached was not running properly after Proxmox installation.',
'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached'
}), 503 # Service Unavailable - more appropriate than 500
return jsonify({'error': f'Failed to get RRD data: {rrd_result.stderr}'}), 500
except json.JSONDecodeError:
# pvesh returned invalid JSON - likely empty RRD
return jsonify({
'error': 'RRD data not available',
'details': 'Unable to parse metrics data. The RRD database may be empty or corrupted.',
'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached'
}), 503
except Exception as e:
return jsonify({'error': str(e)}), 500

View File

@@ -1001,7 +1001,7 @@ EVENT_GROUPS = {
}
# ─── Template Renderer ──────────────────────────────────────────
# ─── Template Renderer ──────────────────────────────────────────
def _get_hostname() -> str:
"""Get short hostname for message titles."""
@@ -1622,9 +1622,11 @@ BLANK LINES: Insert between logical sections (VM entries, before summary, before
═══ EXAMPLES (follow these formats) ═══
IMPORTANT: {hostname} is a placeholder. Always use the ACTUAL hostname from the original message.
BACKUP START:
[TITLE]
💾🚀 pve01: Backup started
💾🚀 {hostname}: Backup started
[BODY]
Backup job starting on storage PBS.
🏷️ VMs: web01 (100)
@@ -1633,7 +1635,7 @@ Backup job starting on storage PBS.
BACKUP COMPLETE:
[TITLE]
💾✅ pve01: Backup complete
💾✅ {hostname}: Backup complete
[BODY]
Backup job finished on storage local-bak.
@@ -1647,7 +1649,7 @@ Backup job finished on storage local-bak.
BACKUP PARTIAL FAIL:
[TITLE]
💾❌ pve01: Backup partially failed
💾❌ {hostname}: Backup partially failed
[BODY]
Backup job finished with errors.
@@ -1662,7 +1664,7 @@ Backup job finished with errors.
UPDATES:
[TITLE]
📦 amd: Updates available
📦 {hostname}: Updates available
[BODY]
📦 Total updates: 24
🔒 Security updates: 6
@@ -1673,14 +1675,14 @@ UPDATES:
VM/CT START:
[TITLE]
🚀 pve01: VM arch-linux (100) started
🚀 {hostname}: VM arch-linux (100) started
[BODY]
🏷️ Virtual machine arch-linux (ID: 100)
✔️ Now running
HEALTH DEGRADED:
[TITLE]
⚠️ amd: Health warning — Disk I/O
⚠️ {hostname}: Health warning — Disk I/O
[BODY]
💿 Device: /dev/sda
⚠️ 1 sector unreadable (pending)