diff --git a/AppImage/components/hardware.tsx b/AppImage/components/hardware.tsx index 284851fa..23202032 100644 --- a/AppImage/components/hardware.tsx +++ b/AppImage/components/hardware.tsx @@ -162,43 +162,41 @@ const groupAndSortTemperatures = (temperatures: any[]) => { } export default function Hardware() { - // Static data - load once without refresh + // Static data - loaded once on mount. Static fields (CPU, motherboard, memory + // modules, PCI, disks, GPU list) don't change at runtime, so no auto-refresh. + // `mutateStatic` is triggered explicitly after GPU switch-mode changes. const { data: staticHardwareData, error: staticError, isLoading: staticLoading, + mutate: mutateStatic, } = useSWR("/api/hardware", swrFetcher, { revalidateOnFocus: false, revalidateOnReconnect: false, - refreshInterval: 0, // No auto-refresh for static data + refreshInterval: 0, }) - // Dynamic data - refresh every 5 seconds for temperatures, fans, power, ups + // Live data - only temperatures, fans, power, UPS. Polled every 5s. + // Backend /api/hardware/live uses cached ipmitool output (10s) so this is cheap. const { data: dynamicHardwareData, error: dynamicError, - isLoading: dynamicLoading, - } = useSWR("/api/hardware", swrFetcher, { - refreshInterval: 7000, + } = useSWR("/api/hardware/live", swrFetcher, { + refreshInterval: 5000, + revalidateOnFocus: true, + revalidateOnReconnect: true, + dedupingInterval: 2000, }) - // Merge static and dynamic data, preferring static for CPU/memory/PCI/disks + // Merge: static fields from initial load, live fields from the 5s poll. const hardwareData = staticHardwareData ? 
{ - ...dynamicHardwareData, - // Keep static data from initial load - cpu: staticHardwareData.cpu, - motherboard: staticHardwareData.motherboard, - memory_modules: staticHardwareData.memory_modules, - pci_devices: staticHardwareData.pci_devices, - storage_devices: staticHardwareData.storage_devices, - gpus: staticHardwareData.gpus, - // Use dynamic data for these - temperatures: dynamicHardwareData?.temperatures, - fans: dynamicHardwareData?.fans, - power_meter: dynamicHardwareData?.power_meter, - power_supplies: dynamicHardwareData?.power_supplies, - ups: dynamicHardwareData?.ups, + ...staticHardwareData, + temperatures: dynamicHardwareData?.temperatures ?? staticHardwareData.temperatures, + fans: dynamicHardwareData?.fans ?? staticHardwareData.fans, + power_meter: dynamicHardwareData?.power_meter ?? staticHardwareData.power_meter, + power_supplies: dynamicHardwareData?.power_supplies ?? staticHardwareData.power_supplies, + ups: dynamicHardwareData?.ups ?? staticHardwareData.ups, } : undefined @@ -239,21 +237,6 @@ export default function Hardware() { const [showSwitchModeModal, setShowSwitchModeModal] = useState(false) const [switchModeParams, setSwitchModeParams] = useState<{ gpuSlot: string; targetMode: "lxc" | "vm" } | null>(null) - const fetcher = async (url: string) => { - const data = await fetchApi(url) - return data - } - - const { - data: hardwareDataSWR, - error: swrError, - isLoading: swrLoading, - mutate: mutateHardware, - } = useSWR("/api/hardware", fetcher, { - refreshInterval: 30000, - revalidateOnFocus: false, - }) - // Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC) const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "unknown" => { const driver = gpu.pci_driver?.toLowerCase() || "" @@ -304,7 +287,7 @@ export default function Hardware() { const handleSwitchModeSave = (gpuSlot: string, e: React.MouseEvent) => { e.stopPropagation() const pendingMode = pendingSwitchModes[gpuSlot] - const gpu = hardwareDataSWR?.gpus?.find(g => 
g.slot === gpuSlot) + const gpu = hardwareData?.gpus?.find(g => g.slot === gpuSlot) const currentMode = gpu ? getGpuSwitchMode(gpu) : "unknown" if (pendingMode && pendingMode !== currentMode && gpu) { @@ -333,7 +316,7 @@ export default function Hardware() { setSwitchModeParams(null) setPendingSwitchModes({}) // Refresh hardware data - mutateHardware() + mutateStatic() } const handleInstallNvidiaDriver = () => { @@ -391,14 +374,14 @@ export default function Hardware() { } const findPCIDeviceForGPU = (gpu: GPU): PCIDevice | null => { - if (!hardwareDataSWR?.pci_devices || !gpu.slot) return null + if (!hardwareData?.pci_devices || !gpu.slot) return null // Try to find exact match first (e.g., "00:02.0") - let pciDevice = hardwareDataSWR.pci_devices.find((d) => d.slot === gpu.slot) + let pciDevice = hardwareData.pci_devices.find((d) => d.slot === gpu.slot) // If not found, try to match by partial slot (e.g., "00" matches "00:02.0") if (!pciDevice && gpu.slot.length <= 2) { - pciDevice = hardwareDataSWR.pci_devices.find( + pciDevice = hardwareData.pci_devices.find( (d) => d.slot.startsWith(gpu.slot + ":") && (d.type.toLowerCase().includes("vga") || @@ -417,7 +400,7 @@ export default function Hardware() { return realtimeGPUData.has_monitoring_tool === true } - if (swrLoading) { + if (staticLoading) { return (
@@ -433,7 +416,7 @@ export default function Hardware() { return (
{/* System Information - CPU & Motherboard */} - {(hardwareDataSWR?.cpu || hardwareDataSWR?.motherboard) && ( + {(hardwareData?.cpu || hardwareData?.motherboard) && (
@@ -442,44 +425,44 @@ export default function Hardware() {
{/* CPU Info */} - {hardwareDataSWR?.cpu && Object.keys(hardwareDataSWR.cpu).length > 0 && ( + {hardwareData?.cpu && Object.keys(hardwareData.cpu).length > 0 && (

CPU

- {hardwareDataSWR.cpu.model && ( + {hardwareData.cpu.model && (
Model - {hardwareDataSWR.cpu.model} + {hardwareData.cpu.model}
)} - {hardwareDataSWR.cpu.cores_per_socket && hardwareDataSWR.cpu.sockets && ( + {hardwareData.cpu.cores_per_socket && hardwareData.cpu.sockets && (
Cores - {hardwareDataSWR.cpu.sockets} × {hardwareDataSWR.cpu.cores_per_socket} ={" "} - {hardwareDataSWR.cpu.sockets * hardwareDataSWR.cpu.cores_per_socket} cores + {hardwareData.cpu.sockets} × {hardwareData.cpu.cores_per_socket} ={" "} + {hardwareData.cpu.sockets * hardwareData.cpu.cores_per_socket} cores
)} - {hardwareDataSWR.cpu.total_threads && ( + {hardwareData.cpu.total_threads && (
Threads - {hardwareDataSWR.cpu.total_threads} + {hardwareData.cpu.total_threads}
)} - {hardwareDataSWR.cpu.l3_cache && ( + {hardwareData.cpu.l3_cache && (
L3 Cache - {hardwareDataSWR.cpu.l3_cache} + {hardwareData.cpu.l3_cache}
)} - {hardwareDataSWR.cpu.virtualization && ( + {hardwareData.cpu.virtualization && (
Virtualization - {hardwareDataSWR.cpu.virtualization} + {hardwareData.cpu.virtualization}
)}
@@ -487,41 +470,41 @@ export default function Hardware() { )} {/* Motherboard Info */} - {hardwareDataSWR?.motherboard && Object.keys(hardwareDataSWR.motherboard).length > 0 && ( + {hardwareData?.motherboard && Object.keys(hardwareData.motherboard).length > 0 && (

Motherboard

- {hardwareDataSWR.motherboard.manufacturer && ( + {hardwareData.motherboard.manufacturer && (
Manufacturer - {hardwareDataSWR.motherboard.manufacturer} + {hardwareData.motherboard.manufacturer}
)} - {hardwareDataSWR.motherboard.model && ( + {hardwareData.motherboard.model && (
Model - {hardwareDataSWR.motherboard.model} + {hardwareData.motherboard.model}
)} - {hardwareDataSWR.motherboard.bios?.vendor && ( + {hardwareData.motherboard.bios?.vendor && (
BIOS - {hardwareDataSWR.motherboard.bios.vendor} + {hardwareData.motherboard.bios.vendor}
)} - {hardwareDataSWR.motherboard.bios?.version && ( + {hardwareData.motherboard.bios?.version && (
Version - {hardwareDataSWR.motherboard.bios.version} + {hardwareData.motherboard.bios.version}
)} - {hardwareDataSWR.motherboard.bios?.date && ( + {hardwareData.motherboard.bios?.date && (
Date - {hardwareDataSWR.motherboard.bios.date} + {hardwareData.motherboard.bios.date}
)}
@@ -532,18 +515,18 @@ export default function Hardware() { )} {/* Memory Modules */} - {hardwareDataSWR?.memory_modules && hardwareDataSWR.memory_modules.length > 0 && ( + {hardwareData?.memory_modules && hardwareData.memory_modules.length > 0 && (

Memory Modules

- {hardwareDataSWR.memory_modules.length} installed + {hardwareData.memory_modules.length} installed
- {hardwareDataSWR.memory_modules.map((module, index) => ( + {hardwareData.memory_modules.map((module, index) => (
{module.slot}
@@ -590,29 +573,29 @@ export default function Hardware() { )} {/* Thermal Monitoring */} - {hardwareDataSWR?.temperatures && hardwareDataSWR.temperatures.length > 0 && ( + {hardwareData?.temperatures && hardwareData.temperatures.length > 0 && (

Thermal Monitoring

- {hardwareDataSWR.temperatures.length} sensors + {hardwareData.temperatures.length} sensors
{/* CPU Sensors */} - {groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length > 0 && ( + {groupAndSortTemperatures(hardwareData.temperatures).CPU.length > 0 && (

CPU

- {groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.length} + {groupAndSortTemperatures(hardwareData.temperatures).CPU.length}
- {groupAndSortTemperatures(hardwareDataSWR.temperatures).CPU.map((temp, index) => { + {groupAndSortTemperatures(hardwareData.temperatures).CPU.map((temp, index) => { const percentage = temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 const isHot = temp.current > (temp.high || 80) @@ -643,21 +626,21 @@ export default function Hardware() { )} {/* GPU Sensors */} - {groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length > 0 && ( + {groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 0 && (
1 ? "md:col-span-2" : ""} + className={groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:col-span-2" : ""} >

GPU

- {groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.length} + {groupAndSortTemperatures(hardwareData.temperatures).GPU.length}
1 ? "md:grid-cols-2" : ""}`} + className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).GPU.length > 1 ? "md:grid-cols-2" : ""}`} > - {groupAndSortTemperatures(hardwareDataSWR.temperatures).GPU.map((temp, index) => { + {groupAndSortTemperatures(hardwareData.temperatures).GPU.map((temp, index) => { const percentage = temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 const isHot = temp.current > (temp.high || 80) @@ -688,23 +671,23 @@ export default function Hardware() { )} {/* NVME Sensors */} - {groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length > 0 && ( + {groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 0 && (
1 ? "md:col-span-2" : "" + groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:col-span-2" : "" } >

NVME

- {groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.length} + {groupAndSortTemperatures(hardwareData.temperatures).NVME.length}
1 ? "md:grid-cols-2" : ""}`} + className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).NVME.length > 1 ? "md:grid-cols-2" : ""}`} > - {groupAndSortTemperatures(hardwareDataSWR.temperatures).NVME.map((temp, index) => { + {groupAndSortTemperatures(hardwareData.temperatures).NVME.map((temp, index) => { const percentage = temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 const isHot = temp.current > (temp.high || 80) @@ -735,21 +718,21 @@ export default function Hardware() { )} {/* PCI Sensors */} - {groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length > 0 && ( + {groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 0 && (
1 ? "md:col-span-2" : ""} + className={groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:col-span-2" : ""} >

PCI

- {groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.length} + {groupAndSortTemperatures(hardwareData.temperatures).PCI.length}
1 ? "md:grid-cols-2" : ""}`} + className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).PCI.length > 1 ? "md:grid-cols-2" : ""}`} > - {groupAndSortTemperatures(hardwareDataSWR.temperatures).PCI.map((temp, index) => { + {groupAndSortTemperatures(hardwareData.temperatures).PCI.map((temp, index) => { const percentage = temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 const isHot = temp.current > (temp.high || 80) @@ -780,23 +763,23 @@ export default function Hardware() { )} {/* OTHER Sensors */} - {groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length > 0 && ( + {groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 0 && (
1 ? "md:col-span-2" : "" + groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:col-span-2" : "" } >

OTHER

- {groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.length} + {groupAndSortTemperatures(hardwareData.temperatures).OTHER.length}
1 ? "md:grid-cols-2" : ""}`} + className={`grid gap-4 ${groupAndSortTemperatures(hardwareData.temperatures).OTHER.length > 1 ? "md:grid-cols-2" : ""}`} > - {groupAndSortTemperatures(hardwareDataSWR.temperatures).OTHER.map((temp, index) => { + {groupAndSortTemperatures(hardwareData.temperatures).OTHER.map((temp, index) => { const percentage = temp.critical > 0 ? (temp.current / temp.critical) * 100 : (temp.current / 100) * 100 const isHot = temp.current > (temp.high || 80) @@ -830,18 +813,18 @@ export default function Hardware() { )} {/* GPU Information - Enhanced with on-demand data fetching */} - {hardwareDataSWR?.gpus && hardwareDataSWR.gpus.length > 0 && ( + {hardwareData?.gpus && hardwareData.gpus.length > 0 && (

Graphics Cards

- {hardwareDataSWR.gpus.length} GPU{hardwareDataSWR.gpus.length > 1 ? "s" : ""} + {hardwareData.gpus.length} GPU{hardwareData.gpus.length > 1 ? "s" : ""}
- {hardwareDataSWR.gpus.map((gpu, index) => { + {hardwareData.gpus.map((gpu, index) => { const pciDevice = findPCIDeviceForGPU(gpu) const fullSlot = pciDevice?.slot || gpu.slot @@ -1324,7 +1307,7 @@ return ( {/* Power Consumption */} - {hardwareDataSWR?.power_meter && ( + {hardwareData?.power_meter && (
@@ -1334,13 +1317,13 @@ return (
-

{hardwareDataSWR.power_meter.name}

- {hardwareDataSWR.power_meter.adapter && ( -

{hardwareDataSWR.power_meter.adapter}

+

{hardwareData.power_meter.name}

+ {hardwareData.power_meter.adapter && ( +

{hardwareData.power_meter.adapter}

)}
-

{hardwareDataSWR.power_meter.watts.toFixed(1)} W

+

{hardwareData.power_meter.watts.toFixed(1)} W

Current Draw

@@ -1349,18 +1332,18 @@ return ( )} {/* Power Supplies */} - {hardwareDataSWR?.power_supplies && hardwareDataSWR.power_supplies.length > 0 && ( + {hardwareData?.power_supplies && hardwareData.power_supplies.length > 0 && (

Power Supplies

- {hardwareDataSWR.power_supplies.length} PSUs + {hardwareData.power_supplies.length} PSUs
- {hardwareDataSWR.power_supplies.map((psu, index) => ( + {hardwareData.power_supplies.map((psu, index) => (
{psu.name} @@ -1377,18 +1360,18 @@ return ( )} {/* Fans */} - {hardwareDataSWR?.fans && hardwareDataSWR.fans.length > 0 && ( + {hardwareData?.fans && hardwareData.fans.length > 0 && (

System Fans

- {hardwareDataSWR.fans.length} fans + {hardwareData.fans.length} fans
- {hardwareDataSWR.fans.map((fan, index) => { + {hardwareData.fans.map((fan, index) => { const isPercentage = fan.unit === "percent" || fan.unit === "%" const percentage = isPercentage ? fan.speed : Math.min((fan.speed / 5000) * 100, 100) @@ -1412,18 +1395,18 @@ return ( )} {/* UPS */} - {hardwareDataSWR?.ups && Array.isArray(hardwareDataSWR.ups) && hardwareDataSWR.ups.length > 0 && ( + {hardwareData?.ups && Array.isArray(hardwareData.ups) && hardwareData.ups.length > 0 && (

UPS Status

- {hardwareDataSWR.ups.length} UPS + {hardwareData.ups.length} UPS
- {hardwareDataSWR.ups.map((ups: any, index: number) => { + {hardwareData.ups.map((ups: any, index: number) => { const batteryCharge = ups.battery_charge_raw || Number.parseFloat(ups.battery_charge?.replace("%", "") || "0") const loadPercent = ups.load_percent_raw || Number.parseFloat(ups.load_percent?.replace("%", "") || "0") @@ -1694,18 +1677,18 @@ return ( {/* PCI Devices - Changed to modal */} - {hardwareDataSWR?.pci_devices && hardwareDataSWR.pci_devices.length > 0 && ( + {hardwareData?.pci_devices && hardwareData.pci_devices.length > 0 && (

PCI Devices

- {hardwareDataSWR.pci_devices.length} devices + {hardwareData.pci_devices.length} devices
- {hardwareDataSWR.pci_devices.map((device, index) => ( + {hardwareData.pci_devices.map((device, index) => (
setSelectedPCIDevice(device)} @@ -1787,19 +1770,19 @@ return ( {/* Network Summary - Clickable */} - {hardwareDataSWR?.pci_devices && - hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && ( + {hardwareData?.pci_devices && + hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length > 0 && (

Network Summary

- {hardwareDataSWR.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces + {hardwareData.pci_devices.filter((d) => d.type.toLowerCase().includes("network")).length} interfaces
- {hardwareDataSWR.pci_devices + {hardwareData.pci_devices .filter((d) => d.type.toLowerCase().includes("network")) .map((device, index) => (
{/* Storage Summary - Clickable */} - {hardwareDataSWR?.storage_devices && hardwareDataSWR.storage_devices.length > 0 && ( + {hardwareData?.storage_devices && hardwareData.storage_devices.length > 0 && (

Storage Summary

{ - hardwareDataSWR.storage_devices.filter( + hardwareData.storage_devices.filter( (device) => device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"), ).length @@ -1896,7 +1879,7 @@ return (
- {hardwareDataSWR.storage_devices + {hardwareData.storage_devices .filter( (device) => device.type === "disk" && !device.name.startsWith("zd") && !device.name.startsWith("loop"), ) @@ -2239,12 +2222,12 @@ return ( description="Installing NVIDIA proprietary drivers for GPU monitoring..." onClose={() => { setNvidiaSessionId(null) - mutateHardware() + mutateStatic() }} onComplete={(success) => { console.log("[v0] NVIDIA installation completed:", success ? "success" : "failed") if (success) { - mutateHardware() + mutateStatic() } }} /> */} @@ -2252,7 +2235,7 @@ return ( open={showNvidiaInstaller} onClose={() => { setShowNvidiaInstaller(false) - mutateHardware() + mutateStatic() }} scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/nvidia_installer.sh" scriptName="nvidia_installer" @@ -2266,7 +2249,7 @@ return ( open={showAmdInstaller} onClose={() => { setShowAmdInstaller(false) - mutateHardware() + mutateStatic() }} scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/amd_gpu_tools.sh" scriptName="amd_gpu_tools" @@ -2280,7 +2263,7 @@ title="AMD GPU Tools Installation" open={showIntelInstaller} onClose={() => { setShowIntelInstaller(false) - mutateHardware() + mutateStatic() }} scriptPath="/usr/local/share/proxmenux/scripts/gpu_tpu/intel_gpu_tools.sh" scriptName="intel_gpu_tools" diff --git a/AppImage/components/system-overview.tsx b/AppImage/components/system-overview.tsx index 87c50dec..eec08512 100644 --- a/AppImage/components/system-overview.tsx +++ b/AppImage/components/system-overview.tsx @@ -222,7 +222,7 @@ export function SystemOverview() { const systemInterval = setInterval(async () => { const data = await fetchSystemData() if (data) setSystemData(data) - }, 9000) + }, 5000) const vmInterval = setInterval(async () => { const data = await fetchVMData() diff --git a/AppImage/components/virtual-machines.tsx b/AppImage/components/virtual-machines.tsx index 926c7236..faccabeb 100644 --- a/AppImage/components/virtual-machines.tsx +++ 
b/AppImage/components/virtual-machines.tsx @@ -295,10 +295,10 @@ export function VirtualMachines() { isLoading, mutate, } = useSWR("/api/vms", fetcher, { - refreshInterval: 23000, - revalidateOnFocus: false, + refreshInterval: 5000, + revalidateOnFocus: true, revalidateOnReconnect: true, - dedupingInterval: 10000, + dedupingInterval: 2000, errorRetryCount: 2, }) diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index f2d0dfd1..ed6a088c 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1088,43 +1088,50 @@ def _health_collector_loop(): def _vital_signs_sampler(): - """Dedicated thread for rapid CPU & temperature sampling. - + """Dedicated thread for rapid CPU, memory & temperature sampling. + Runs independently of the 5-min health collector loop. - - CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis) + - CPU usage: sampled every 30s (10 samples in 5 min for sustained detection) + - Memory: sampled every 30s (10 samples in 5 min for sustained detection) - Temperature: sampled every 15s (12 samples in 3 min for temporal logic) Uses time.monotonic() to avoid drift. - - Staggered intervals: CPU at offset 0, Temp at offset 7s to avoid collision. + + Staggered intervals to avoid collision: CPU at 0, Temp at +7s, Mem at +15s. 
""" from health_monitor import health_monitor - + # Wait 15s after startup for sensors to be ready time.sleep(15) - + TEMP_INTERVAL = 15 # seconds (was 10s - reduced frequency by 33%) CPU_INTERVAL = 30 # seconds - - # Stagger: CPU starts immediately, Temp starts after 7s offset + MEM_INTERVAL = 30 # seconds (aligned with CPU for sustained-RAM detection) + + # Stagger: CPU starts immediately, Temp after 7s, Mem after 15s next_cpu = time.monotonic() next_temp = time.monotonic() + 7 - - print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)") - + next_mem = time.monotonic() + 15 + + print("[ProxMenux] Vital signs sampler started (CPU: 30s, Mem: 30s, Temp: 15s)") + while True: try: now = time.monotonic() - + if now >= next_temp: health_monitor._sample_cpu_temperature() next_temp = now + TEMP_INTERVAL - + if now >= next_cpu: health_monitor._sample_cpu_usage() next_cpu = now + CPU_INTERVAL - + + if now >= next_mem: + health_monitor._sample_memory_usage() + next_mem = now + MEM_INTERVAL + # Sleep until the next earliest event (with 0.5s min to avoid busy-loop) - sleep_until = min(next_temp, next_cpu) - time.monotonic() + sleep_until = min(next_temp, next_cpu, next_mem) - time.monotonic() time.sleep(max(sleep_until, 0.5)) except Exception as e: print(f"[ProxMenux] Vital signs sampler error: {e}") @@ -1160,7 +1167,7 @@ _pvesh_cache = { 'storage_list': None, 'storage_list_time': 0, } -_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance +_PVESH_CACHE_TTL = 5 # 5 seconds - near real-time for active UI; pvesh local cost is ~200-400ms # Cache for sensors output (temperature readings) _sensors_cache = { @@ -1169,6 +1176,15 @@ _sensors_cache = { } _SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly +# Cache for ipmitool sensor output (shared between fans, power supplies, power meter) +# ipmitool is slow (1-3s per call) and was called twice per /api/hardware hit. 
+_ipmi_cache = { + 'output': None, + 'time': 0, + 'unavailable': False, # set True if ipmitool is missing, avoid retrying +} +_IPMI_CACHE_TTL = 10 # 10 seconds + # Cache for hardware info (lspci, dmidecode, lsblk) _hardware_cache = { 'lspci': None, @@ -3820,13 +3836,42 @@ def get_proxmox_vms(): # Return empty array instead of error object - frontend expects array return [] -def get_ipmi_fans(): - """Get fan information from IPMI""" - fans = [] +def get_cached_ipmi_sensors(): + """Get ipmitool sensor output with 10s cache. Shared between fans/power parsers. + + Returns empty string if ipmitool is unavailable (cached to avoid repeated FileNotFoundError). + """ + global _ipmi_cache + now = time.time() + + if _ipmi_cache['unavailable']: + return '' + + if _ipmi_cache['output'] is not None and \ + now - _ipmi_cache['time'] < _IPMI_CACHE_TTL: + return _ipmi_cache['output'] + try: result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10) if result.returncode == 0: - for line in result.stdout.split('\n'): + _ipmi_cache['output'] = result.stdout + _ipmi_cache['time'] = now + return result.stdout + except FileNotFoundError: + _ipmi_cache['unavailable'] = True + return '' + except Exception: + pass + return _ipmi_cache['output'] or '' + + +def get_ipmi_fans(): + """Get fan information from IPMI (uses cached sensor output).""" + fans = [] + try: + output = get_cached_ipmi_sensors() + if output: + for line in output.split('\n'): if 'fan' in line.lower() and '|' in line: parts = [p.strip() for p in line.split('|')] if len(parts) >= 3: @@ -3862,14 +3907,14 @@ def get_ipmi_fans(): return fans def get_ipmi_power(): - """Get power supply information from IPMI""" + """Get power supply information from IPMI (uses cached sensor output).""" power_supplies = [] power_meter = None - + try: - result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10) - if result.returncode == 0: - for line in result.stdout.split('\n'): + 
output = get_cached_ipmi_sensors() + if output: + for line in output.split('\n'): if ('power supply' in line.lower() or 'power meter' in line.lower()) and '|' in line: parts = [p.strip() for p in line.split('|')] if len(parts) >= 3: @@ -4202,7 +4247,97 @@ def identify_fan(sensor_name, adapter, chip_name=None): return sensor_name # Default: return original name - return sensor_name + return sensor_name + + +def _parse_sensor_fans(sensors_output): + """Parse fan entries from `sensors` output. Extracted for reuse between + get_hardware_info (static full payload) and get_hardware_live_info (live endpoint).""" + fans = [] + if not sensors_output: + return fans + current_adapter = None + current_chip = None + for line in sensors_output.split('\n'): + line = line.strip() + if not line: + continue + if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'): + current_chip = line + continue + if line.startswith('Adapter:'): + current_adapter = line.replace('Adapter:', '').strip() + continue + if ':' in line and not line.startswith(' '): + parts = line.split(':', 1) + sensor_name = parts[0].strip() + value_part = parts[1].strip() + if 'RPM' in value_part: + rpm_match = re.search(r'([\d.]+)\s*RPM', value_part) + if rpm_match: + fan_speed = int(float(rpm_match.group(1))) + identified_name = identify_fan(sensor_name, current_adapter, current_chip) + fans.append({ + 'name': identified_name, + 'original_name': sensor_name, + 'speed': fan_speed, + 'unit': 'RPM', + 'adapter': current_adapter + }) + return fans + + +def get_hardware_live_info(): + """Build only the live/dynamic hardware fields for /api/hardware/live. + + Skips all the heavy static collection (lscpu, dmidecode, lsblk, smartctl, lspci...). + Uses cached sensors + cached ipmitool output to stay cheap under 5s polling. 
+ """ + result = { + 'temperatures': [], + 'fans': [], + 'power_meter': None, + 'power_supplies': [], + 'ups': None, + } + + try: + temp_info = get_temperature_info() + result['temperatures'] = temp_info.get('temperatures', []) + result['power_meter'] = temp_info.get('power_meter') + except Exception: + pass + + try: + sensor_fans = _parse_sensor_fans(get_cached_sensors_output()) + except Exception: + sensor_fans = [] + + try: + ipmi_fans = get_ipmi_fans() + except Exception: + ipmi_fans = [] + + result['fans'] = sensor_fans + ipmi_fans + + try: + ipmi_power = get_ipmi_power() + if ipmi_power: + result['power_supplies'] = ipmi_power.get('power_supplies', []) + # Fallback: if sensors didn't provide a power_meter, use IPMI's + if result['power_meter'] is None and ipmi_power.get('power_meter'): + result['power_meter'] = ipmi_power['power_meter'] + except Exception: + pass + + try: + ups_info = get_ups_info() + if ups_info: + result['ups'] = ups_info + except Exception: + pass + + return result def get_temperature_info(): @@ -6102,52 +6237,8 @@ def get_hardware_info(): pass try: - sensors_output = get_cached_sensors_output() - if sensors_output: - current_adapter = None - current_chip = None # Add chip name tracking - fans = [] - - for line in sensors_output.split('\n'): - line = line.strip() - if not line: - continue - - # Chip names don't have ":" and are not indented - if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'): - current_chip = line - continue - - # Detect adapter line - if line.startswith('Adapter:'): - current_adapter = line.replace('Adapter:', '').strip() - continue - - # Parse fan sensors - if ':' in line and not line.startswith(' '): - parts = line.split(':', 1) - sensor_name = parts[0].strip() - value_part = parts[1].strip() - - # Look for fan sensors (RPM) - if 'RPM' in value_part: - rpm_match = re.search(r'([\d.]+)\s*RPM', value_part) - if rpm_match: - fan_speed = int(float(rpm_match.group(1))) - - identified_name = 
identify_fan(sensor_name, current_adapter, current_chip) - - fans.append({ - 'name': identified_name, - 'original_name': sensor_name, - 'speed': fan_speed, - 'unit': 'RPM', - 'adapter': current_adapter - }) - - hardware_data['sensors']['fans'] = fans - except Exception as e: - # print(f"[v0] Error getting fan sensors: {e}") + hardware_data['sensors']['fans'] = _parse_sensor_fans(get_cached_sensors_output()) + except Exception: pass # Power Supply / UPS @@ -6226,7 +6317,9 @@ def get_hardware_info(): def api_system(): """Get system information including CPU, memory, and temperature""" try: - cpu_usage = psutil.cpu_percent(interval=0.5) + # Non-blocking: returns %CPU since the last psutil call (sampler or prior API hit). + # The background vital-signs sampler keeps psutil's internal state primed. + cpu_usage = psutil.cpu_percent(interval=0) memory = psutil.virtual_memory() memory_used_gb = memory.used / (1024 ** 3) @@ -9286,6 +9379,23 @@ def api_hardware(): traceback.print_exc() return jsonify({'error': str(e)}), 500 +@app.route('/api/hardware/live', methods=['GET']) +@require_auth +def api_hardware_live(): + """Lightweight endpoint: only dynamic hardware fields (temps, fans, power, UPS). + + Designed for the active Hardware page to poll every 3-5s without re-running the + expensive static collectors (lscpu, dmidecode, lsblk, smartctl). ipmitool output + is cached internally (10s) so repeated polls don't hammer the BMC. 
+ """ + try: + return jsonify(get_hardware_live_info()) + except Exception as e: + import traceback + traceback.print_exc() + return jsonify({'error': str(e)}), 500 + + @app.route('/api/gpu//realtime', methods=['GET']) @require_auth def api_gpu_realtime(slot): @@ -9526,8 +9636,11 @@ def api_vm_control(vmid): control_result = subprocess.run( ['pvesh', 'create', f'/nodes/{node}/{vm_type}/{vmid}/status/{action}'], capture_output=True, text=True, timeout=30) - + if control_result.returncode == 0: + # Invalidate VM resources cache so the next /api/vms call + # returns fresh status instead of the pre-action snapshot. + _pvesh_cache['cluster_resources_vm_time'] = 0 return jsonify({ 'success': True, 'vmid': vmid, diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 24c7d97d..36b37e3f 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -67,7 +67,7 @@ class HealthMonitor: # Memory Thresholds MEMORY_WARNING = 85 MEMORY_CRITICAL = 95 - MEMORY_DURATION = 60 + MEMORY_DURATION = 300 # 5 minutes sustained (aligned with CPU) SWAP_WARNING_DURATION = 300 SWAP_CRITICAL_PERCENT = 5 SWAP_CRITICAL_DURATION = 120 @@ -402,6 +402,30 @@ class HealthMonitor: except Exception: pass # Sampling must never crash the thread + def _sample_memory_usage(self): + """Lightweight memory sample: read RAM/swap % and append to history. 
~1ms cost.""" + try: + memory = psutil.virtual_memory() + swap = psutil.swap_memory() + current_time = time.time() + mem_percent = memory.percent + swap_percent = swap.percent if swap.total > 0 else 0 + swap_vs_ram = (swap.used / memory.total * 100) if memory.total > 0 else 0 + state_key = 'memory_usage' + self.state_history[state_key].append({ + 'mem_percent': mem_percent, + 'swap_percent': swap_percent, + 'swap_vs_ram': swap_vs_ram, + 'time': current_time + }) + # Prune entries older than 10 minutes + self.state_history[state_key] = [ + e for e in self.state_history[state_key] + if current_time - e['time'] < 600 + ] + except Exception: + pass # Sampling must never crash the thread + def _sample_cpu_temperature(self): """Lightweight temperature sample: read sensor and append to history. ~50ms cost.""" try: @@ -1050,34 +1074,46 @@ class HealthMonitor: if current_time - entry['time'] < 600 ] - mem_critical = sum( - 1 for entry in self.state_history[state_key] + mem_critical_samples = [ + entry for entry in self.state_history[state_key] if entry['mem_percent'] >= 90 and current_time - entry['time'] <= self.MEMORY_DURATION - ) - - mem_warning = sum( - 1 for entry in self.state_history[state_key] + ] + + mem_warning_samples = [ + entry for entry in self.state_history[state_key] if entry['mem_percent'] >= self.MEMORY_WARNING and current_time - entry['time'] <= self.MEMORY_DURATION - ) - + ] + swap_critical = sum( 1 for entry in self.state_history[state_key] if entry['swap_vs_ram'] > 20 and current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION ) - - - if mem_critical >= 2: + + # Require sustained high usage across most of the 300s window. + # With ~30s sampling: 300s = ~10 samples, so 8 ≈ 240s sustained. + # Mirrors CPU's ~83% coverage threshold (25/30). 
+ MEM_CRITICAL_MIN_SAMPLES = 8 + MEM_WARNING_MIN_SAMPLES = 8 + + mem_critical_count = len(mem_critical_samples) + mem_warning_count = len(mem_warning_samples) + + if mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES: + oldest = min(s['time'] for s in mem_critical_samples) + actual_duration = int(current_time - oldest) status = 'CRITICAL' - reason = f'RAM >90% for {self.MEMORY_DURATION}s' + reason = f'RAM >90% sustained for {actual_duration}s' elif swap_critical >= 2: status = 'CRITICAL' reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)' - elif mem_warning >= 2: + elif mem_warning_count >= MEM_WARNING_MIN_SAMPLES: + oldest = min(s['time'] for s in mem_warning_samples) + actual_duration = int(current_time - oldest) status = 'WARNING' - reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s' + reason = f'RAM >{self.MEMORY_WARNING}% sustained for {actual_duration}s' else: status = 'OK' reason = None @@ -1088,7 +1124,7 @@ class HealthMonitor: swap_total_gb = round(swap.total / (1024**3), 2) # Determine per-sub-check status - ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK') + ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning_count >= MEM_WARNING_MIN_SAMPLES else 'OK') swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK' result = {