mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 10:02:16 +00:00
601 lines
22 KiB
Python
601 lines
22 KiB
Python
"""
|
||
Flask routes for health monitoring with persistence support
|
||
"""
|
||
|
||
from flask import Blueprint, jsonify, request
|
||
from health_monitor import health_monitor
|
||
from health_persistence import health_persistence
|
||
|
||
health_bp = Blueprint('health', __name__)
|
||
|
||
@health_bp.route('/api/health/status', methods=['GET'])
|
||
def get_health_status():
|
||
"""Get overall health status summary"""
|
||
try:
|
||
status = health_monitor.get_overall_status()
|
||
return jsonify(status)
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/details', methods=['GET'])
|
||
def get_health_details():
|
||
"""Get detailed health status with all checks"""
|
||
try:
|
||
details = health_monitor.get_detailed_status()
|
||
return jsonify(details)
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/system-info', methods=['GET'])
|
||
def get_system_info():
|
||
"""
|
||
Get lightweight system info for header display.
|
||
Returns: hostname, uptime, and health status with proper structure.
|
||
"""
|
||
try:
|
||
info = health_monitor.get_system_info()
|
||
|
||
if 'health' in info:
|
||
status_map = {
|
||
'OK': 'healthy',
|
||
'WARNING': 'warning',
|
||
'CRITICAL': 'critical',
|
||
'UNKNOWN': 'warning'
|
||
}
|
||
current_status = info['health'].get('status', 'OK').upper()
|
||
info['health']['status'] = status_map.get(current_status, 'healthy')
|
||
|
||
return jsonify(info)
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/acknowledge', methods=['POST'])
|
||
def acknowledge_error():
|
||
"""
|
||
Acknowledge/dismiss an error manually.
|
||
Returns details about the acknowledged error including original severity
|
||
and suppression period info.
|
||
"""
|
||
try:
|
||
data = request.get_json()
|
||
if not data or 'error_key' not in data:
|
||
return jsonify({'error': 'error_key is required'}), 400
|
||
|
||
error_key = data['error_key']
|
||
result = health_persistence.acknowledge_error(error_key)
|
||
|
||
if result.get('success'):
|
||
# Invalidate cached health results so next fetch reflects the dismiss
|
||
# Use the error's category to clear the correct cache
|
||
category = result.get('category', '')
|
||
cache_key_map = {
|
||
'logs': 'logs_analysis',
|
||
'pve_services': 'pve_services',
|
||
'updates': 'updates_check',
|
||
'security': 'security_check',
|
||
'temperature': 'cpu_check',
|
||
'network': 'network_check',
|
||
'disks': 'storage_check',
|
||
'vms': 'vms_check',
|
||
}
|
||
cache_key = cache_key_map.get(category)
|
||
if cache_key:
|
||
health_monitor.last_check_times.pop(cache_key, None)
|
||
health_monitor.cached_results.pop(cache_key, None)
|
||
|
||
# Also invalidate ALL background/overall caches so next fetch reflects dismiss
|
||
for ck in ['_bg_overall', '_bg_detailed', 'overall_health']:
|
||
health_monitor.last_check_times.pop(ck, None)
|
||
health_monitor.cached_results.pop(ck, None)
|
||
|
||
# Use the per-record suppression hours from acknowledge_error()
|
||
sup_hours = result.get('suppression_hours', 24)
|
||
if sup_hours == -1:
|
||
suppression_label = 'permanently'
|
||
elif sup_hours >= 8760:
|
||
suppression_label = f'{sup_hours // 8760} year(s)'
|
||
elif sup_hours >= 720:
|
||
suppression_label = f'{sup_hours // 720} month(s)'
|
||
elif sup_hours >= 168:
|
||
suppression_label = f'{sup_hours // 168} week(s)'
|
||
elif sup_hours >= 72:
|
||
suppression_label = f'{sup_hours // 24} day(s)'
|
||
else:
|
||
suppression_label = f'{sup_hours} hours'
|
||
|
||
return jsonify({
|
||
'success': True,
|
||
'message': f'Error dismissed for {suppression_label}',
|
||
'error_key': error_key,
|
||
'original_severity': result.get('original_severity', 'WARNING'),
|
||
'category': category,
|
||
'suppression_hours': sup_hours,
|
||
'suppression_label': suppression_label,
|
||
'acknowledged_at': result.get('acknowledged_at')
|
||
})
|
||
else:
|
||
return jsonify({
|
||
'success': False,
|
||
'message': 'Error not found or already dismissed',
|
||
'error_key': error_key
|
||
}), 404
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/active-errors', methods=['GET'])
|
||
def get_active_errors():
|
||
"""Get all active persistent errors"""
|
||
try:
|
||
category = request.args.get('category')
|
||
errors = health_persistence.get_active_errors(category)
|
||
return jsonify({'errors': errors})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/dismissed', methods=['GET'])
|
||
def get_dismissed_errors():
|
||
"""
|
||
Get dismissed errors that are still within their suppression period.
|
||
These are shown as INFO items with a 'Dismissed' badge in the frontend.
|
||
"""
|
||
try:
|
||
dismissed = health_persistence.get_dismissed_errors()
|
||
return jsonify({'dismissed': dismissed})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/full', methods=['GET'])
|
||
def get_full_health():
|
||
"""
|
||
Get complete health data in a single request: detailed status + active errors + dismissed.
|
||
Uses background-cached results if fresh (< 6 min) for instant response,
|
||
otherwise runs a fresh check.
|
||
"""
|
||
import time as _time
|
||
try:
|
||
# Try to use the background-cached detailed result for instant response
|
||
bg_key = '_bg_detailed'
|
||
bg_last = health_monitor.last_check_times.get(bg_key, 0)
|
||
bg_age = _time.time() - bg_last
|
||
|
||
if bg_age < 360 and bg_key in health_monitor.cached_results:
|
||
# Use cached result (at most ~5 min old)
|
||
details = health_monitor.cached_results[bg_key]
|
||
else:
|
||
# No fresh cache, run live (first load or cache expired)
|
||
details = health_monitor.get_detailed_status()
|
||
|
||
active_errors = health_persistence.get_active_errors()
|
||
dismissed = health_persistence.get_dismissed_errors()
|
||
custom_suppressions = health_persistence.get_custom_suppressions()
|
||
|
||
return jsonify({
|
||
'health': details,
|
||
'active_errors': active_errors,
|
||
'dismissed': dismissed,
|
||
'custom_suppressions': custom_suppressions,
|
||
'timestamp': details.get('timestamp')
|
||
})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/cleanup-orphans', methods=['POST'])
|
||
def cleanup_orphan_errors():
|
||
"""
|
||
Clean up errors for devices that no longer exist in the system.
|
||
Useful when USB drives or temporary devices are disconnected.
|
||
"""
|
||
import os
|
||
import re
|
||
try:
|
||
cleaned = []
|
||
# Get all active disk errors
|
||
disk_errors = health_persistence.get_active_errors(category='disks')
|
||
|
||
for err in disk_errors:
|
||
err_key = err.get('error_key', '')
|
||
details = err.get('details', {})
|
||
if isinstance(details, str):
|
||
try:
|
||
import json as _json
|
||
details = _json.loads(details)
|
||
except Exception:
|
||
details = {}
|
||
|
||
device = details.get('device', '')
|
||
base_disk = details.get('disk', '')
|
||
|
||
# Try to determine the device path
|
||
dev_path = None
|
||
if base_disk:
|
||
dev_path = f'/dev/{base_disk}'
|
||
elif device:
|
||
dev_path = device if device.startswith('/dev/') else f'/dev/{device}'
|
||
elif err_key.startswith('disk_'):
|
||
# Extract device from error_key
|
||
dev_name = err_key.replace('disk_fs_', '').replace('disk_', '')
|
||
dev_name = re.sub(r'_.*$', '', dev_name) # Remove suffix
|
||
if dev_name:
|
||
dev_path = f'/dev/{dev_name}'
|
||
|
||
if dev_path:
|
||
# Also check base disk (remove partition number)
|
||
base_path = re.sub(r'\d+$', '', dev_path)
|
||
if not os.path.exists(dev_path) and not os.path.exists(base_path):
|
||
health_persistence.resolve_error(err_key, 'Device no longer present (manual cleanup)')
|
||
cleaned.append({'error_key': err_key, 'device': dev_path})
|
||
|
||
# Also cleanup disk_observations for non-existent devices
|
||
try:
|
||
health_persistence.cleanup_orphan_observations()
|
||
except Exception:
|
||
pass
|
||
|
||
return jsonify({
|
||
'success': True,
|
||
'cleaned_count': len(cleaned),
|
||
'cleaned_errors': cleaned
|
||
})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/pending-notifications', methods=['GET'])
|
||
def get_pending_notifications():
|
||
"""
|
||
Get events pending notification (for future Telegram/Gotify/Discord integration).
|
||
This endpoint will be consumed by the Notification Service (Bloque A).
|
||
"""
|
||
try:
|
||
pending = health_persistence.get_pending_notifications()
|
||
return jsonify({'pending': pending, 'count': len(pending)})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
@health_bp.route('/api/health/mark-notified', methods=['POST'])
|
||
def mark_events_notified():
|
||
"""
|
||
Mark events as notified after notification was sent successfully.
|
||
Used by the Notification Service (Bloque A) after sending alerts.
|
||
"""
|
||
try:
|
||
data = request.get_json()
|
||
if not data or 'event_ids' not in data:
|
||
return jsonify({'error': 'event_ids array is required'}), 400
|
||
|
||
event_ids = data['event_ids']
|
||
if not isinstance(event_ids, list):
|
||
return jsonify({'error': 'event_ids must be an array'}), 400
|
||
|
||
health_persistence.mark_events_notified(event_ids)
|
||
return jsonify({'success': True, 'marked_count': len(event_ids)})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/settings', methods=['GET'])
|
||
def get_health_settings():
|
||
"""
|
||
Get per-category suppression duration settings.
|
||
Returns all health categories with their current configured hours.
|
||
"""
|
||
try:
|
||
categories = health_persistence.get_suppression_categories()
|
||
return jsonify({'categories': categories})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/settings', methods=['POST'])
|
||
def save_health_settings():
|
||
"""
|
||
Save per-category suppression duration settings.
|
||
Expects JSON body with key-value pairs like: {"suppress_cpu": "168", "suppress_memory": "-1"}
|
||
Valid values: 24, 72, 168, 720, 8760, -1 (permanent), or any positive integer for custom.
|
||
"""
|
||
try:
|
||
data = request.get_json()
|
||
if not data:
|
||
return jsonify({'error': 'No settings provided'}), 400
|
||
|
||
valid_keys = set(health_persistence.CATEGORY_SETTING_MAP.values())
|
||
updated = []
|
||
|
||
for key, value in data.items():
|
||
if key not in valid_keys:
|
||
continue
|
||
|
||
try:
|
||
hours = int(value)
|
||
# Validate: must be -1 (permanent) or positive
|
||
if hours != -1 and hours < 1:
|
||
continue
|
||
health_persistence.set_setting(key, str(hours))
|
||
updated.append(key)
|
||
except (ValueError, TypeError):
|
||
continue
|
||
|
||
# Retroactively sync all existing dismissed errors
|
||
# so changes are effective immediately, not just on next dismiss
|
||
synced_count = health_persistence.sync_dismissed_suppression()
|
||
|
||
return jsonify({
|
||
'success': True,
|
||
'updated': updated,
|
||
'count': len(updated),
|
||
'synced_dismissed': synced_count
|
||
})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
# ── Remote Storage Exclusions Endpoints ──
|
||
|
||
@health_bp.route('/api/health/remote-storages', methods=['GET'])
|
||
def get_remote_storages():
|
||
"""
|
||
Get list of all remote storages with their exclusion status.
|
||
Remote storages are those that can be offline (PBS, NFS, CIFS, etc.)
|
||
"""
|
||
try:
|
||
from proxmox_storage_monitor import proxmox_storage_monitor
|
||
|
||
# Get current storage status
|
||
storage_status = proxmox_storage_monitor.get_storage_status()
|
||
all_storages = storage_status.get('available', []) + storage_status.get('unavailable', [])
|
||
|
||
# Filter to only remote types
|
||
remote_types = health_persistence.REMOTE_STORAGE_TYPES
|
||
remote_storages = [s for s in all_storages if s.get('type', '').lower() in remote_types]
|
||
|
||
# Get current exclusions
|
||
exclusions = {e['storage_name']: e for e in health_persistence.get_excluded_storages()}
|
||
|
||
# Combine info
|
||
result = []
|
||
for storage in remote_storages:
|
||
name = storage.get('name', '')
|
||
exclusion = exclusions.get(name, {})
|
||
result.append({
|
||
'name': name,
|
||
'type': storage.get('type', 'unknown'),
|
||
'status': storage.get('status', 'unknown'),
|
||
'total': storage.get('total', 0),
|
||
'used': storage.get('used', 0),
|
||
'available': storage.get('available', 0),
|
||
'percent': storage.get('percent', 0),
|
||
'exclude_health': exclusion.get('exclude_health', 0) == 1,
|
||
'exclude_notifications': exclusion.get('exclude_notifications', 0) == 1,
|
||
'excluded_at': exclusion.get('excluded_at'),
|
||
'reason': exclusion.get('reason')
|
||
})
|
||
|
||
return jsonify({
|
||
'storages': result,
|
||
'remote_types': list(remote_types)
|
||
})
|
||
except ImportError:
|
||
return jsonify({'error': 'Storage monitor not available', 'storages': []}), 200
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/storage-exclusions', methods=['GET'])
|
||
def get_storage_exclusions():
|
||
"""Get all storage exclusions."""
|
||
try:
|
||
exclusions = health_persistence.get_excluded_storages()
|
||
return jsonify({'exclusions': exclusions})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/storage-exclusions', methods=['POST'])
|
||
def save_storage_exclusion():
|
||
"""
|
||
Add or update a storage exclusion.
|
||
|
||
Request body:
|
||
{
|
||
"storage_name": "pbs-backup",
|
||
"storage_type": "pbs",
|
||
"exclude_health": true,
|
||
"exclude_notifications": true,
|
||
"reason": "PBS server is offline daily"
|
||
}
|
||
"""
|
||
try:
|
||
data = request.get_json()
|
||
if not data or 'storage_name' not in data:
|
||
return jsonify({'error': 'storage_name is required'}), 400
|
||
|
||
storage_name = data['storage_name']
|
||
storage_type = data.get('storage_type', 'unknown')
|
||
exclude_health = data.get('exclude_health', True)
|
||
exclude_notifications = data.get('exclude_notifications', True)
|
||
reason = data.get('reason')
|
||
|
||
# Check if already excluded
|
||
existing = health_persistence.get_excluded_storages()
|
||
exists = any(e['storage_name'] == storage_name for e in existing)
|
||
|
||
if exists:
|
||
# Update existing
|
||
success = health_persistence.update_storage_exclusion(
|
||
storage_name, exclude_health, exclude_notifications
|
||
)
|
||
else:
|
||
# Add new
|
||
success = health_persistence.exclude_storage(
|
||
storage_name, storage_type, exclude_health, exclude_notifications, reason
|
||
)
|
||
|
||
if success:
|
||
return jsonify({
|
||
'success': True,
|
||
'message': f'Storage {storage_name} exclusion saved',
|
||
'storage_name': storage_name
|
||
})
|
||
else:
|
||
return jsonify({'error': 'Failed to save exclusion'}), 500
|
||
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/storage-exclusions/<storage_name>', methods=['DELETE'])
|
||
def delete_storage_exclusion(storage_name):
|
||
"""Remove a storage from the exclusion list."""
|
||
try:
|
||
success = health_persistence.remove_storage_exclusion(storage_name)
|
||
if success:
|
||
return jsonify({
|
||
'success': True,
|
||
'message': f'Storage {storage_name} removed from exclusions'
|
||
})
|
||
else:
|
||
return jsonify({'error': 'Storage not found in exclusions'}), 404
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════
|
||
# NETWORK INTERFACE EXCLUSION ROUTES
|
||
# ══════════════════<E29590><E29590><EFBFBD>════════════════════════════════════════════════════════
|
||
|
||
@health_bp.route('/api/health/interfaces', methods=['GET'])
|
||
def get_network_interfaces():
|
||
"""Get all network interfaces with their exclusion status."""
|
||
try:
|
||
import psutil
|
||
|
||
# Get all interfaces
|
||
net_if_stats = psutil.net_if_stats()
|
||
net_if_addrs = psutil.net_if_addrs()
|
||
|
||
# Get current exclusions
|
||
exclusions = {e['interface_name']: e for e in health_persistence.get_excluded_interfaces()}
|
||
|
||
result = []
|
||
for iface, stats in net_if_stats.items():
|
||
if iface == 'lo':
|
||
continue
|
||
|
||
# Determine interface type
|
||
if iface.startswith('vmbr'):
|
||
iface_type = 'bridge'
|
||
elif iface.startswith('bond'):
|
||
iface_type = 'bond'
|
||
elif iface.startswith(('vlan', 'veth')):
|
||
iface_type = 'vlan'
|
||
elif iface.startswith(('eth', 'ens', 'enp', 'eno')):
|
||
iface_type = 'physical'
|
||
else:
|
||
iface_type = 'other'
|
||
|
||
# Get IP address if any
|
||
ip_addr = None
|
||
if iface in net_if_addrs:
|
||
for addr in net_if_addrs[iface]:
|
||
if addr.family == 2: # IPv4
|
||
ip_addr = addr.address
|
||
break
|
||
|
||
exclusion = exclusions.get(iface, {})
|
||
result.append({
|
||
'name': iface,
|
||
'type': iface_type,
|
||
'is_up': stats.isup,
|
||
'speed': stats.speed,
|
||
'ip_address': ip_addr,
|
||
'exclude_health': exclusion.get('exclude_health', 0) == 1,
|
||
'exclude_notifications': exclusion.get('exclude_notifications', 0) == 1,
|
||
'excluded_at': exclusion.get('excluded_at'),
|
||
'reason': exclusion.get('reason')
|
||
})
|
||
|
||
# Sort: bridges first, then physical, then others
|
||
type_order = {'bridge': 0, 'bond': 1, 'physical': 2, 'vlan': 3, 'other': 4}
|
||
result.sort(key=lambda x: (type_order.get(x['type'], 5), x['name']))
|
||
|
||
return jsonify({'interfaces': result})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/interface-exclusions', methods=['GET'])
|
||
def get_interface_exclusions():
|
||
"""Get all interface exclusions."""
|
||
try:
|
||
exclusions = health_persistence.get_excluded_interfaces()
|
||
return jsonify({'exclusions': exclusions})
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/interface-exclusions', methods=['POST'])
|
||
def save_interface_exclusion():
|
||
"""
|
||
Add or update an interface exclusion.
|
||
|
||
Request body:
|
||
{
|
||
"interface_name": "vmbr0",
|
||
"interface_type": "bridge",
|
||
"exclude_health": true,
|
||
"exclude_notifications": true,
|
||
"reason": "Intentionally disabled bridge"
|
||
}
|
||
"""
|
||
try:
|
||
data = request.get_json()
|
||
if not data or 'interface_name' not in data:
|
||
return jsonify({'error': 'interface_name is required'}), 400
|
||
|
||
interface_name = data['interface_name']
|
||
interface_type = data.get('interface_type', 'unknown')
|
||
exclude_health = data.get('exclude_health', True)
|
||
exclude_notifications = data.get('exclude_notifications', True)
|
||
reason = data.get('reason')
|
||
|
||
# Check if already excluded
|
||
existing = health_persistence.get_excluded_interfaces()
|
||
exists = any(e['interface_name'] == interface_name for e in existing)
|
||
|
||
if exists:
|
||
# Update existing
|
||
success = health_persistence.update_interface_exclusion(
|
||
interface_name, exclude_health, exclude_notifications
|
||
)
|
||
else:
|
||
# Add new
|
||
success = health_persistence.exclude_interface(
|
||
interface_name, interface_type, exclude_health, exclude_notifications, reason
|
||
)
|
||
|
||
if success:
|
||
return jsonify({
|
||
'success': True,
|
||
'message': f'Interface {interface_name} exclusion saved',
|
||
'interface_name': interface_name
|
||
})
|
||
else:
|
||
return jsonify({'error': 'Failed to save exclusion'}), 500
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|
||
|
||
|
||
@health_bp.route('/api/health/interface-exclusions/<interface_name>', methods=['DELETE'])
|
||
def delete_interface_exclusion(interface_name):
|
||
"""Remove an interface from the exclusion list."""
|
||
try:
|
||
success = health_persistence.remove_interface_exclusion(interface_name)
|
||
if success:
|
||
return jsonify({
|
||
'success': True,
|
||
'message': f'Interface {interface_name} removed from exclusions'
|
||
})
|
||
else:
|
||
return jsonify({'error': 'Interface not found in exclusions'}), 404
|
||
except Exception as e:
|
||
return jsonify({'error': str(e)}), 500
|