diff --git a/AppImage/ProxMenux-1.2.1.2-beta.AppImage b/AppImage/ProxMenux-1.2.1.2-beta.AppImage index 25175139..d8ae4717 100755 Binary files a/AppImage/ProxMenux-1.2.1.2-beta.AppImage and b/AppImage/ProxMenux-1.2.1.2-beta.AppImage differ diff --git a/AppImage/ProxMenux-Monitor.AppImage.sha256 b/AppImage/ProxMenux-Monitor.AppImage.sha256 index 20b53588..110ef6d5 100644 --- a/AppImage/ProxMenux-Monitor.AppImage.sha256 +++ b/AppImage/ProxMenux-Monitor.AppImage.sha256 @@ -1 +1 @@ -69aed2fd5627fa4542f1cd26163aa2eda5843f7e1a23d259298a173d8e377c36 ProxMenux-1.2.1.2-beta.AppImage +0e75481fb3e466dff46e60776fdfe874c17dd653f492c11c6214e3e410013441 ProxMenux-1.2.1.2-beta.AppImage diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 62139d88..2753f9af 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -1372,7 +1372,15 @@ _pvesh_cache = { 'storage_list': None, 'storage_list_time': 0, } -_PVESH_CACHE_TTL = 2 # 2 seconds - near real-time, single consistent data source for list + modal +_PVESH_CACHE_TTL = 5 # 5 seconds — Sprint 14.7: bumped from 2s. +# At 2 s the cache routinely thrashed when two parallel callers raced +# the first `pvesh` (~1 s under gevent without monkey-patch on PVE 9): +# both saw an empty cache, both spawned a `pvesh`, and the dashboard +# served stale data interleaved with 502s. 5 s comfortably covers the +# longest pvesh response while still feeling real-time for the UI +# (which refreshes its widgets every 10–30 s). Coupled with the +# in-flight lock below this also prevents thundering-herd against PVE. +_pvesh_cluster_resources_lock = threading.Lock() # Cache for sensors output (temperature readings) _sensors_cache = { @@ -1414,7 +1422,14 @@ _HARDWARE_CACHE_TTL = 300 # 5 minutes - hardware doesn't change def get_cached_pvesh_cluster_resources_vm(): - """Get cluster VM resources with 30s cache.""" + """Get cluster VM resources with a per-process cache. + + `_PVESH_CACHE_TTL` controls cache freshness; `_pvesh_cluster_resources_lock` + serialises in-flight calls so a thundering herd of dashboard fetches + after a TTL expiry results in ONE `pvesh` instead of N. After the + holder of the lock finishes, every other caller picks up the freshly + populated cache and returns immediately. + """ global _pvesh_cache now = time.time() @@ -1422,19 +1437,29 @@ def get_cached_pvesh_cluster_resources_vm(): now - _pvesh_cache['cluster_resources_vm_time'] < _PVESH_CACHE_TTL: return _pvesh_cache['cluster_resources_vm'] - try: - result = subprocess.run( - ['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], - capture_output=True, text=True, timeout=10 - ) - if result.returncode == 0: - data = json.loads(result.stdout) - _pvesh_cache['cluster_resources_vm'] = data - _pvesh_cache['cluster_resources_vm_time'] = now - return data - except Exception: - pass - return _pvesh_cache['cluster_resources_vm'] or [] + with _pvesh_cluster_resources_lock: + # Re-check inside the lock — another thread may have refreshed + # while we were waiting for our turn. This is the single-flight + # pattern: only the first thread runs `pvesh`, everyone else + # benefits from the same result without spawning duplicates. + now = time.time() + if _pvesh_cache['cluster_resources_vm'] is not None and \ + now - _pvesh_cache['cluster_resources_vm_time'] < _PVESH_CACHE_TTL: + return _pvesh_cache['cluster_resources_vm'] + + try: + result = subprocess.run( + ['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'], + capture_output=True, text=True, timeout=10 + ) + if result.returncode == 0: + data = json.loads(result.stdout) + _pvesh_cache['cluster_resources_vm'] = data + _pvesh_cache['cluster_resources_vm_time'] = now + return data + except Exception: + pass + return _pvesh_cache['cluster_resources_vm'] or [] # Sprint 14 perf pass: three backup endpoints (`/api/backups`, @@ -9468,21 +9493,53 @@ def api_vm_metrics(vmid): return jsonify({'error': str(e)}), 500 +# Per-process cache for the RRD payload of /api/node/metrics. Two unrelated +# dashboard components (`network-traffic-chart` for the network panel and +# `node-metrics-charts` for the CPU/memory panel) mount in parallel on the +# Overview page and each fires this endpoint independently with the same +# `?timeframe=` argument. The underlying `pvesh get rrddata` call takes +# ~1 second; without a cache, the second fetch blocks behind the first +# (especially under gevent), occasionally surfacing as a transient 502 +# while gevent is single-threaded for blocking calls. RRD data is updated +# on a per-minute cadence by PVE, so a 10-second cache is safe and the +# UI experience is materially better. +_NODE_METRICS_CACHE = {} +_NODE_METRICS_TTL = 10.0 # seconds + + +def _node_metrics_cache_get(timeframe): + entry = _NODE_METRICS_CACHE.get(timeframe) + if not entry: + return None + if time.monotonic() - entry['ts'] > _NODE_METRICS_TTL: + return None + return entry['payload'] + + +def _node_metrics_cache_set(timeframe, payload): + _NODE_METRICS_CACHE[timeframe] = {'payload': payload, 'ts': time.monotonic()} + + @app.route('/api/node/metrics', methods=['GET']) @require_auth def api_node_metrics(): - """Get historical metrics (RRD data) for the node""" + """Get historical metrics (RRD data) for the node. + + Per-timeframe cached for ~10 s so the two dashboard panels that mount + together don't hit `pvesh` twice; see `_NODE_METRICS_CACHE` comment. + """ try: timeframe = request.args.get('timeframe', 'week') # hour, day, week, month, year - - # Validate timeframe valid_timeframes = ['hour', 'day', 'week', 'month', 'year'] if timeframe not in valid_timeframes: - # print(f"[v0] ERROR: Invalid timeframe: {timeframe}") - pass return jsonify({'error': f'Invalid timeframe. Must be one of: {", ".join(valid_timeframes)}'}), 400 + + # Serve from cache when fresh — completely skips the pvesh call. + cached = _node_metrics_cache_get(timeframe) + if cached is not None: + return jsonify(cached) # Get local node name # local_node = socket.gethostname() @@ -9513,18 +9570,20 @@ def api_node_metrics(): if rrd_result.returncode == 0: rrd_data = json.loads(rrd_result.stdout) - + if zfs_arc_size > 0: for item in rrd_data: # If zfsarc field is missing or 0, add current value if 'zfsarc' not in item or item.get('zfsarc', 0) == 0: item['zfsarc'] = zfs_arc_size - - return jsonify({ + + payload = { 'node': local_node, 'timeframe': timeframe, 'data': rrd_data - }) + } + _node_metrics_cache_set(timeframe, payload) + return jsonify(payload) else: # Check if RRD file is empty or corrupted stderr_lower = rrd_result.stderr.lower() if rrd_result.stderr else ''