mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-28 14:10:40 +00:00
update health_persistence.py
This commit is contained in:
@@ -458,7 +458,7 @@ def delete_storage_exclusion(storage_name):
|
|||||||
return jsonify({'error': str(e)}), 500
|
return jsonify({'error': str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
# NETWORK INTERFACE EXCLUSION ROUTES
|
# NETWORK INTERFACE EXCLUSION ROUTES
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
|||||||
@@ -228,7 +228,6 @@ class HealthMonitor:
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initialize health monitor with state tracking"""
|
"""Initialize health monitor with state tracking"""
|
||||||
print("[HealthMonitor] Version 2026-03-31-v2 - Stale resource cleanup enabled")
|
|
||||||
self.state_history = defaultdict(list)
|
self.state_history = defaultdict(list)
|
||||||
self.last_check_times = {}
|
self.last_check_times = {}
|
||||||
self.cached_results = {}
|
self.cached_results = {}
|
||||||
@@ -1218,10 +1217,7 @@ class HealthMonitor:
|
|||||||
'dismissable': True,
|
'dismissable': True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# Update worst_health for the disk (persists even if current error clears)
|
# Register the disk for observation tracking (worst_health no longer used)
|
||||||
# Use serial for proper USB disk tracking
|
|
||||||
health_persistence.update_disk_worst_health(device, disk_serial if disk_serial else None, severity.lower())
|
|
||||||
# Also register the disk for observation tracking
|
|
||||||
if disk_serial:
|
if disk_serial:
|
||||||
health_persistence.register_disk(device, disk_serial, disk_model, 0)
|
health_persistence.register_disk(device, disk_serial, disk_model, 0)
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1242,7 +1238,7 @@ class HealthMonitor:
|
|||||||
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
|
if disk_path not in storage_details or storage_details[disk_path].get('status') == 'OK':
|
||||||
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
|
issues.append(f'{disk_path}: {disk_info.get("reason", "I/O errors")}')
|
||||||
storage_details[disk_path] = disk_info
|
storage_details[disk_path] = disk_info
|
||||||
# Update worst_health for I/O errors
|
|
||||||
device = disk_path.replace('/dev/', '')
|
device = disk_path.replace('/dev/', '')
|
||||||
io_severity = disk_info.get('status', 'WARNING').lower()
|
io_severity = disk_info.get('status', 'WARNING').lower()
|
||||||
|
|
||||||
@@ -1262,8 +1258,8 @@ class HealthMonitor:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Register the disk for observation tracking (worst_health no longer used)
|
||||||
try:
|
try:
|
||||||
health_persistence.update_disk_worst_health(device, io_serial if io_serial else None, io_severity)
|
|
||||||
if io_serial:
|
if io_serial:
|
||||||
health_persistence.register_disk(device, io_serial, io_model, 0)
|
health_persistence.register_disk(device, io_serial, io_model, 0)
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1459,24 +1455,10 @@ class HealthMonitor:
|
|||||||
serial = disk_info.get('serial', '')
|
serial = disk_info.get('serial', '')
|
||||||
model = disk_info.get('model', '')
|
model = disk_info.get('model', '')
|
||||||
|
|
||||||
# Get worst_health from persistence
|
# Use current status directly from Proxmox/SMART - no persistent worst_health
|
||||||
|
# Historical observations are preserved separately in disk_observations table
|
||||||
current_status = error_info.get('status', 'WARNING')
|
current_status = error_info.get('status', 'WARNING')
|
||||||
try:
|
final_status = current_status
|
||||||
health_status = health_persistence.get_disk_health_status(device_name, serial if serial else None)
|
|
||||||
worst_health = health_status.get('worst_health', 'healthy')
|
|
||||||
|
|
||||||
# Final health = max(current, worst)
|
|
||||||
health_order = {'healthy': 0, 'ok': 0, 'warning': 1, 'critical': 2}
|
|
||||||
current_level = health_order.get(current_status.lower(), 1)
|
|
||||||
worst_level = health_order.get(worst_health.lower(), 0)
|
|
||||||
|
|
||||||
if worst_level > current_level:
|
|
||||||
# worst_health is worse, use it
|
|
||||||
final_status = worst_health.upper()
|
|
||||||
else:
|
|
||||||
final_status = current_status
|
|
||||||
except Exception:
|
|
||||||
final_status = current_status
|
|
||||||
|
|
||||||
# Build detail string with serial/model if available
|
# Build detail string with serial/model if available
|
||||||
detail = error_info.get('detail', error_info.get('reason', 'Unknown error'))
|
detail = error_info.get('detail', error_info.get('reason', 'Unknown error'))
|
||||||
|
|||||||
@@ -868,7 +868,6 @@ class HealthPersistence:
|
|||||||
return self._cleanup_old_errors_impl()
|
return self._cleanup_old_errors_impl()
|
||||||
|
|
||||||
def _cleanup_old_errors_impl(self):
|
def _cleanup_old_errors_impl(self):
|
||||||
print("[HealthPersistence] Running cleanup_old_errors...")
|
|
||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
@@ -963,14 +962,13 @@ class HealthPersistence:
|
|||||||
now_iso = now.isoformat()
|
now_iso = now.isoformat()
|
||||||
|
|
||||||
# Get all active (unresolved) errors with first_seen and last_seen for age checks
|
# Get all active (unresolved) errors with first_seen and last_seen for age checks
|
||||||
|
# An error is considered unresolved if resolution_type is NULL or empty
|
||||||
|
# (resolved_at alone is not sufficient - it may be in an inconsistent state)
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT id, error_key, category, reason, first_seen, last_seen, severity FROM errors
|
SELECT id, error_key, category, reason, first_seen, last_seen, severity FROM errors
|
||||||
WHERE resolved_at IS NULL
|
WHERE resolution_type IS NULL OR resolution_type = ''
|
||||||
''')
|
''')
|
||||||
active_errors = cursor.fetchall()
|
active_errors = cursor.fetchall()
|
||||||
|
|
||||||
print(f"[HealthPersistence] _cleanup_stale_resources: Found {len(active_errors)} active errors to check")
|
|
||||||
|
|
||||||
resolved_count = 0
|
resolved_count = 0
|
||||||
|
|
||||||
# Cache for expensive checks (avoid repeated subprocess calls)
|
# Cache for expensive checks (avoid repeated subprocess calls)
|
||||||
@@ -1086,13 +1084,9 @@ class HealthPersistence:
|
|||||||
# Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
|
# Check if VM/CT still exists (covers: vms/vmct categories, vm_*, ct_*, vmct_* error keys)
|
||||||
if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
|
if category in ('vms', 'vmct') or (error_key and (error_key.startswith('vm_') or error_key.startswith('ct_') or error_key.startswith('vmct_'))):
|
||||||
vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason)
|
vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(reason)
|
||||||
print(f"[HealthPersistence] Checking VM/CT error: key={error_key}, category={category}, vmid={vmid}")
|
if vmid and not check_vm_ct_cached(vmid):
|
||||||
if vmid:
|
should_resolve = True
|
||||||
exists = check_vm_ct_cached(vmid)
|
resolution_reason = 'VM/CT deleted'
|
||||||
print(f"[HealthPersistence] VM/CT {vmid} exists: {exists}")
|
|
||||||
if not exists:
|
|
||||||
should_resolve = True
|
|
||||||
resolution_reason = 'VM/CT deleted'
|
|
||||||
|
|
||||||
# === DISK ERRORS ===
|
# === DISK ERRORS ===
|
||||||
# Check if disk device or ZFS pool still exists
|
# Check if disk device or ZFS pool still exists
|
||||||
@@ -1207,7 +1201,6 @@ class HealthPersistence:
|
|||||||
resolution_reason = 'Stale error (no activity >7d)'
|
resolution_reason = 'Stale error (no activity >7d)'
|
||||||
|
|
||||||
if should_resolve:
|
if should_resolve:
|
||||||
print(f"[HealthPersistence] Resolving error: {error_key} - {resolution_reason}")
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors SET resolved_at = ?, resolution_type = 'auto', resolution_reason = ?
|
UPDATE errors SET resolved_at = ?, resolution_type = 'auto', resolution_reason = ?
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
@@ -1862,130 +1855,10 @@ class HealthPersistence:
|
|||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def update_disk_worst_health(self, device_name: str, serial: Optional[str],
|
# NOTE: update_disk_worst_health, get_disk_health_status, clear_disk_health_history
|
||||||
new_health: str) -> bool:
|
# were removed. The disk health badge now shows the CURRENT status from Proxmox/SMART
|
||||||
"""Update worst_health if new_health is worse than current.
|
# directly, not a persistent "worst_health". Historical observations are preserved
|
||||||
|
# in disk_observations table and shown separately via the "X obs." badge.
|
||||||
Health hierarchy: healthy < warning < critical
|
|
||||||
Only escalates, never downgrades automatically.
|
|
||||||
|
|
||||||
Returns True if worst_health was updated.
|
|
||||||
"""
|
|
||||||
health_order = {'healthy': 0, 'warning': 1, 'critical': 2}
|
|
||||||
new_level = health_order.get(new_health.lower(), 0)
|
|
||||||
|
|
||||||
if new_level == 0: # healthy never updates worst_health
|
|
||||||
return False
|
|
||||||
|
|
||||||
now = datetime.now().isoformat()
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
|
||||||
if not disk_id:
|
|
||||||
# Register disk first
|
|
||||||
self.register_disk(device_name.replace('/dev/', ''), serial)
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
|
||||||
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Get current worst_health
|
|
||||||
cursor.execute('SELECT worst_health FROM disk_registry WHERE id = ?', (disk_id,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
current_worst = row[0] if row and row[0] else 'healthy'
|
|
||||||
current_level = health_order.get(current_worst.lower(), 0)
|
|
||||||
|
|
||||||
# Only update if new health is worse
|
|
||||||
if new_level > current_level:
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE disk_registry
|
|
||||||
SET worst_health = ?, worst_health_date = ?, admin_cleared = NULL
|
|
||||||
WHERE id = ?
|
|
||||||
''', (new_health.lower(), now, disk_id))
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return True
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error updating worst_health for {device_name}: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_disk_health_status(self, device_name: str, serial: Optional[str] = None) -> Dict[str, Any]:
|
|
||||||
"""Get the health status of a disk including worst_health.
|
|
||||||
|
|
||||||
Returns dict with:
|
|
||||||
- worst_health: 'healthy', 'warning', or 'critical'
|
|
||||||
- worst_health_date: ISO timestamp when worst_health was set
|
|
||||||
- admin_cleared: ISO timestamp if admin manually cleared the health
|
|
||||||
- observations_count: Number of recorded observations
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return {'worst_health': 'healthy', 'observations_count': 0}
|
|
||||||
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT worst_health, worst_health_date, admin_cleared
|
|
||||||
FROM disk_registry WHERE id = ?
|
|
||||||
''', (disk_id,))
|
|
||||||
row = cursor.fetchone()
|
|
||||||
|
|
||||||
# Count observations
|
|
||||||
cursor.execute(
|
|
||||||
'SELECT COUNT(*) FROM disk_observations WHERE disk_registry_id = ? AND dismissed = 0',
|
|
||||||
(disk_id,))
|
|
||||||
obs_count = cursor.fetchone()[0]
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
if row:
|
|
||||||
return {
|
|
||||||
'worst_health': row[0] or 'healthy',
|
|
||||||
'worst_health_date': row[1],
|
|
||||||
'admin_cleared': row[2],
|
|
||||||
'observations_count': obs_count
|
|
||||||
}
|
|
||||||
return {'worst_health': 'healthy', 'observations_count': obs_count}
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error getting disk health for {device_name}: {e}")
|
|
||||||
return {'worst_health': 'healthy', 'observations_count': 0}
|
|
||||||
|
|
||||||
def clear_disk_health_history(self, device_name: str, serial: Optional[str] = None) -> bool:
|
|
||||||
"""Admin action: clear worst_health back to healthy.
|
|
||||||
|
|
||||||
This resets the health status but keeps all observations for audit.
|
|
||||||
Records when the admin cleared it for accountability.
|
|
||||||
"""
|
|
||||||
now = datetime.now().isoformat()
|
|
||||||
try:
|
|
||||||
conn = self._get_conn()
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
disk_id = self._get_disk_registry_id(cursor, device_name, serial)
|
|
||||||
if not disk_id:
|
|
||||||
conn.close()
|
|
||||||
return False
|
|
||||||
|
|
||||||
cursor.execute('''
|
|
||||||
UPDATE disk_registry
|
|
||||||
SET worst_health = 'healthy', worst_health_date = NULL, admin_cleared = ?
|
|
||||||
WHERE id = ?
|
|
||||||
''', (now, disk_id))
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[HealthPersistence] Error clearing health for {device_name}: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def record_disk_observation(self, device_name: str, serial: Optional[str],
|
def record_disk_observation(self, device_name: str, serial: Optional[str],
|
||||||
error_type: str, error_signature: str,
|
error_type: str, error_signature: str,
|
||||||
@@ -2025,9 +1898,7 @@ class HealthPersistence:
|
|||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
# Observation recorded - worst_health no longer updated (badge shows current SMART status)
|
||||||
# Update worst_health based on observation severity
|
|
||||||
self.update_disk_worst_health(clean_dev, serial, severity)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthPersistence] Error recording disk observation: {e}")
|
print(f"[HealthPersistence] Error recording disk observation: {e}")
|
||||||
|
|||||||
@@ -197,7 +197,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
# ─── Journal Watcher (Real-time) ─────────────────────────────────
|
# ─── Journal Watcher (Real-time) ────────────────────────────────
|
||||||
|
|
||||||
class JournalWatcher:
|
class JournalWatcher:
|
||||||
"""Watches journald in real-time for critical system events.
|
"""Watches journald in real-time for critical system events.
|
||||||
@@ -964,10 +964,7 @@ class JournalWatcher:
|
|||||||
raw_message=raw_msg,
|
raw_message=raw_msg,
|
||||||
severity='warning',
|
severity='warning',
|
||||||
)
|
)
|
||||||
|
# Observation recorded - worst_health no longer used (badge shows current SMART status)
|
||||||
# Update worst_health for permanent tracking (record_disk_observation
|
|
||||||
# already does this, but we ensure it here for safety)
|
|
||||||
health_persistence.update_disk_worst_health(base_dev, serial, 'warning')
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
||||||
|
|||||||
@@ -883,7 +883,7 @@ TEMPLATES = {
|
|||||||
'default_enabled': True,
|
'default_enabled': True,
|
||||||
},
|
},
|
||||||
|
|
||||||
# ── ProxMenux updates ──
|
# ── ProxMenux updates ──
|
||||||
'proxmenux_update': {
|
'proxmenux_update': {
|
||||||
'title': '{hostname}: ProxMenux {new_version} available',
|
'title': '{hostname}: ProxMenux {new_version} available',
|
||||||
'body': (
|
'body': (
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import time
|
|||||||
import threading
|
import threading
|
||||||
from typing import Set, List, Tuple, Optional
|
from typing import Set, List, Tuple, Optional
|
||||||
|
|
||||||
# ─── Configuration ───────────────────────────────────────────────────────────
|
# ─── Configuration ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
# Grace period durations (seconds)
|
# Grace period durations (seconds)
|
||||||
STARTUP_VM_GRACE_SECONDS = 180 # 3 minutes for VM/CT start aggregation
|
STARTUP_VM_GRACE_SECONDS = 180 # 3 minutes for VM/CT start aggregation
|
||||||
|
|||||||
Reference in New Issue
Block a user