From e11daa0b362a0ad78802b6a96f51b24bda0372d4 Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 2 Apr 2026 16:59:09 +0200 Subject: [PATCH] update ai_context_enrichment.py --- AppImage/scripts/ai_context_enrichment.py | 28 +++--- AppImage/scripts/flask_server.py | 101 ++++++++++++++------- AppImage/scripts/notification_templates.py | 16 ++-- 3 files changed, 89 insertions(+), 56 deletions(-) diff --git a/AppImage/scripts/ai_context_enrichment.py b/AppImage/scripts/ai_context_enrichment.py index 64fc78f7..f07d3ffa 100644 --- a/AppImage/scripts/ai_context_enrichment.py +++ b/AppImage/scripts/ai_context_enrichment.py @@ -304,26 +304,22 @@ def enrich_context_for_ai( context_parts = [] combined_text = f"{title} {body} {journal_context}" - # 1. System uptime - only relevant for failure/error events, not informational + # 1. System uptime - ONLY for critical system-level failures # Uptime helps distinguish startup issues from runtime failures - # Only include uptime when something FAILED or has CRITICAL/WARNING status - uptime_relevant_types = [ - 'fail', 'error', 'critical', 'crash', 'panic', 'oom', - 'disk_error', 'smart_error', 'io_error', 'service_fail', - 'split_brain', 'quorum_lost', 'node_offline' - ] - # Exclude informational events (success, start, stop, complete, etc.) 
- informational_types = [ - 'update', 'upgrade', 'available', 'info', 'resolved', - 'start', 'stop', 'shutdown', 'restart', 'complete', - 'backup_complete', 'backup_start', 'migration' + # BUT it's noise for disk errors, warnings, or routine operations + # Only include for: system crash, kernel panic, OOM, cluster failures + uptime_critical_types = [ + 'crash', 'panic', 'oom', 'kernel', + 'split_brain', 'quorum_lost', 'node_offline', 'node_fail', + 'system_fail', 'boot_fail' ] - is_uptime_relevant = any(t in event_type.lower() for t in uptime_relevant_types) - is_informational = any(t in event_type.lower() for t in informational_types) + # Check if this is a critical system-level event (not disk/service/hardware) + event_lower = event_type.lower() + is_critical_system_event = any(t in event_lower for t in uptime_critical_types) - # Only add uptime for actual failures, not routine operations - if is_uptime_relevant and not is_informational: + # Only add uptime for critical system failures, nothing else + if is_critical_system_event: uptime = get_system_uptime() if uptime and uptime != "unknown": context_parts.append(f"System uptime: {uptime}") diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 0e53c4aa..a6c90f15 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -6192,6 +6192,8 @@ def api_network_interface_metrics(interface_name): rrd_data = [] + rrd_error = None + if interface_type == 'vm_lxc': # For VM/LXC interfaces, get data from the VM/LXC RRD vmid, vm_type = extract_vmid_from_interface(interface_name) @@ -6202,19 +6204,20 @@ def api_network_interface_metrics(interface_name): capture_output=True, text=True, timeout=10) if rrd_result.returncode == 0: - all_data = json.loads(rrd_result.stdout) - # Filter to only network-related fields - for point in all_data: - filtered_point = {'time': point.get('time')} - # Add network fields if they exist - for key in ['netin', 'netout']: - if key in 
point: - filtered_point[key] = point[key] - rrd_data.append(filtered_point) - + try: + all_data = json.loads(rrd_result.stdout) + # Filter to only network-related fields + for point in all_data: + filtered_point = {'time': point.get('time')} + # Add network fields if they exist + for key in ['netin', 'netout']: + if key in point: + filtered_point[key] = point[key] + rrd_data.append(filtered_point) + except json.JSONDecodeError: + rrd_error = f'RRD data for {vm_type.upper()} {vmid} is empty or corrupted' else: - # print(f"[v0] ERROR: Failed to get RRD data for VM/LXC") - pass + rrd_error = f'Failed to get RRD data: {rrd_result.stderr}' else: # For physical/bridge interfaces, get data from node RRD @@ -6223,38 +6226,42 @@ def api_network_interface_metrics(interface_name): capture_output=True, text=True, timeout=10) if rrd_result.returncode == 0: - all_data = json.loads(rrd_result.stdout) - # Filter to only network-related fields for this interface - for point in all_data: - filtered_point = {'time': point.get('time')} - # Add network fields if they exist - for key in ['netin', 'netout']: - if key in point: - filtered_point[key] = point[key] - rrd_data.append(filtered_point) - + try: + all_data = json.loads(rrd_result.stdout) + # Filter to only network-related fields for this interface + for point in all_data: + filtered_point = {'time': point.get('time')} + # Add network fields if they exist + for key in ['netin', 'netout']: + if key in point: + filtered_point[key] = point[key] + rrd_data.append(filtered_point) + except json.JSONDecodeError: + rrd_error = 'Node RRD data is empty or corrupted' else: - # print(f"[v0] ERROR: Failed to get RRD data for node") - pass + rrd_error = f'Failed to get RRD data: {rrd_result.stderr}' + # If there was an RRD error and no data collected, return error with details + if rrd_error and not rrd_data: + return jsonify({ + 'error': 'RRD data not available', + 'details': rrd_error, + 'suggestion': 'The RRD database may be empty or 
corrupted. Try: systemctl restart rrdcached' + }), 503 + return jsonify({ 'interface': interface_name, 'type': interface_type, 'timeframe': timeframe, - 'data': rrd_data + 'data': rrd_data, + 'warning': rrd_error if rrd_error else None # Include warning if there was an error but some data exists }) except Exception as e: return jsonify({'error': str(e)}), 500 -@app.route('/api/vms', methods=['GET']) -@require_auth -def api_vms(): - """Get virtual machine information""" - return jsonify(get_proxmox_vms()) - @app.route('/api/vms//metrics', methods=['GET']) @require_auth def api_vm_metrics(vmid): @@ -6316,9 +6323,22 @@ def api_vm_metrics(vmid): 'data': rrd_data }) else: - + # Check if RRD file is empty or corrupted + stderr_lower = rrd_result.stderr.lower() if rrd_result.stderr else '' + if 'rrd' in stderr_lower or 'no such file' in stderr_lower or 'empty' in stderr_lower: + return jsonify({ + 'error': 'RRD data not available', + 'details': f'The RRD database for {vm_type.upper()} {vmid} may be empty or corrupted.', + 'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached' + }), 503 return jsonify({'error': f'Failed to get RRD data: {rrd_result.stderr}'}), 500 - + + except json.JSONDecodeError: + return jsonify({ + 'error': 'RRD data not available', + 'details': f'Unable to parse metrics data for VM/LXC {vmid}.', + 'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached' + }), 503 except Exception as e: return jsonify({'error': str(e)}), 500 @@ -6381,8 +6401,23 @@ def api_node_metrics(): 'data': rrd_data }) else: + # Check if RRD file is empty or corrupted + stderr_lower = rrd_result.stderr.lower() if rrd_result.stderr else '' + if 'rrd' in stderr_lower or 'no such file' in stderr_lower or 'empty' in stderr_lower: + return jsonify({ + 'error': 'RRD data not available', + 'details': 'The RRD database file may be empty or corrupted. 
This can happen if rrdcached was not running properly after Proxmox installation.', + 'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached' + }), 503 # Service Unavailable - more appropriate than 500 return jsonify({'error': f'Failed to get RRD data: {rrd_result.stderr}'}), 500 + except json.JSONDecodeError: + # pvesh returned invalid JSON - likely empty RRD + return jsonify({ + 'error': 'RRD data not available', + 'details': 'Unable to parse metrics data. The RRD database may be empty or corrupted.', + 'suggestion': 'Try restarting rrdcached: systemctl restart rrdcached' + }), 503 except Exception as e: return jsonify({'error': str(e)}), 500 diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 8a75ac86..b1c5b79a 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -1001,7 +1001,7 @@ EVENT_GROUPS = { } -# ─── Template Renderer ─────────────────────────────────────────── +# ─── Template Renderer ─────────────────────────────────────────── def _get_hostname() -> str: """Get short hostname for message titles.""" @@ -1622,9 +1622,11 @@ BLANK LINES: Insert between logical sections (VM entries, before summary, before ═══ EXAMPLES (follow these formats) ═══ +IMPORTANT: {hostname} is a placeholder. Always use the ACTUAL hostname from the original message. + BACKUP START: [TITLE] -💾🚀 pve01: Backup started +💾🚀 {hostname}: Backup started [BODY] Backup job starting on storage PBS. 🏷️ VMs: web01 (100) @@ -1633,7 +1635,7 @@ Backup job starting on storage PBS. BACKUP COMPLETE: [TITLE] -💾✅ pve01: Backup complete +💾✅ {hostname}: Backup complete [BODY] Backup job finished on storage local-bak. @@ -1647,7 +1649,7 @@ Backup job finished on storage local-bak. BACKUP PARTIAL FAIL: [TITLE] -💾❌ pve01: Backup partially failed +💾❌ {hostname}: Backup partially failed [BODY] Backup job finished with errors. @@ -1662,7 +1664,7 @@ 
UPDATES: [TITLE] -📦 amd: Updates available +📦 {hostname}: Updates available [BODY] 📦 Total updates: 24 🔒 Security updates: 6 @@ -1673,14 +1675,14 @@ UPDATES: VM/CT START: [TITLE] -🚀 pve01: VM arch-linux (100) started +🚀 {hostname}: VM arch-linux (100) started [BODY] 🏷️ Virtual machine arch-linux (ID: 100) ✔️ Now running HEALTH DEGRADED: [TITLE] -⚠️ amd: Health warning — Disk I/O +⚠️ {hostname}: Health warning — Disk I/O [BODY] 💿 Device: /dev/sda ⚠️ 1 sector unreadable (pending)