"""Sprint 14: per-disk temperature history. Mirrors the CPU ``temperature_history`` infrastructure in flask_server, but keyed by disk name so each physical drive gets its own time series. Same SQLite DB (``/usr/local/share/proxmenux/monitor.db``), same 30-day retention, same downsampling buckets the CPU history endpoint uses (hour=raw / day=5min / week=30min / month=2h). The sampler is a single function meant to be called once per minute from flask_server's existing ``_temperature_collector_loop``, so we don't add another background thread. Performance — three caches keep the steady-state cost flat on big JBODs: * ``_disk_list_cache`` — lsblk + USB filter, refreshed every 5 min. * ``_disk_probe_cache`` — remembers which ``smartctl -d `` variant works for each disk so we skip the 4-attempt fallback chain. * ``_disk_fail_backoff`` — drives that never report a temperature are rate-limited to one re-probe per hour instead of every minute. The actual smartctl calls run in a ThreadPoolExecutor, so a 24-disk host spends ~max(per-disk time) per sample instead of sum. """ from __future__ import annotations import json import os import re import sqlite3 import subprocess import threading import time from concurrent.futures import ThreadPoolExecutor from typing import Any, Optional # Use the same DB the CPU temperature pipeline writes to so we share # the WAL file and the periodic vacuum that flask_server already runs. _DB_DIR = "/usr/local/share/proxmenux" _DB_PATH = os.path.join(_DB_DIR, "monitor.db") # Retention window for raw samples. Matches CPU history. _RETENTION_DAYS = 30 # How long ``lsblk`` and each ``smartctl`` call are allowed to run. # A single hung drive should not block the rest of the batch. _LSBLK_TIMEOUT = 5 _SMARTCTL_TIMEOUT = 5 # --------------------------------------------------------------------------- # Caching strategy (Sprint 14 perf pass) # # On a 24-disk host the naive sampler can spend several seconds per minute # just iterating smartctl. Three caches keep the steady-state cost flat: # # _disk_list_cache — the (lsblk + USB filter) result. Disks don't # appear/disappear between samples, so we only # re-enumerate every _DISK_LIST_TTL seconds. # # _disk_probe_cache — once we know `/dev/sdX` answers to e.g. the # `-d sat` invocation, we skip the other 3 # fallback variants on every subsequent sample. # # _disk_fail_backoff — drives that consistently report no temperature # (USB-bridges that don't pass SMART through, # virtual SR-IOV NVMe namespaces, etc.) get # backed off for a long window so we don't keep # re-probing them every minute. # # All three are guarded by a single lock — contention is irrelevant because # the sampler runs once a minute, but the cache is also read by request # handlers that can race with the collector. # --------------------------------------------------------------------------- _DISK_LIST_TTL = 300 # 5 minutes _FAIL_BACKOFF_SECONDS = 3600 # 1 hour _FAIL_THRESHOLD = 3 # consecutive failures before backoff kicks in _MAX_WORKERS = 16 # cap concurrency for huge JBODs _cache_lock = threading.Lock() _disk_list_cache: Optional[tuple[float, list[str]]] = None # Maps disk_name -> probe key: 'auto' | 'nvme' | 'ata' | 'sat'. # Only successful probes get cached. _disk_probe_cache: dict[str, str] = {} # Maps disk_name -> consecutive_failures count (cleared on success). _disk_fail_counts: dict[str, int] = {} # Maps disk_name -> next-allowed-retry timestamp once backoff trips. _disk_fail_backoff: dict[str, float] = {} def _invalidate_disk_list_cache() -> None: """Force the next sample to re-run lsblk. Call this from anywhere that knows topology has changed (hot-swap, manual rescan, etc.).""" global _disk_list_cache with _cache_lock: _disk_list_cache = None def reset_disk_caches() -> None: """Drop every cached entry. Useful for diagnostics and tests.""" global _disk_list_cache with _cache_lock: _disk_list_cache = None _disk_probe_cache.clear() _disk_fail_counts.clear() _disk_fail_backoff.clear() def get_cache_stats() -> dict[str, Any]: """Snapshot of the internal caches — surfaced via flask_server for operators to confirm the optimisations are doing what they should.""" now = time.time() with _cache_lock: list_cached = _disk_list_cache is not None and _disk_list_cache[0] > now list_size = len(_disk_list_cache[1]) if _disk_list_cache else 0 list_expires_in = max(0, int(_disk_list_cache[0] - now)) if _disk_list_cache else 0 return { "disk_list": { "cached": list_cached, "size": list_size, "expires_in_seconds": list_expires_in, "ttl_seconds": _DISK_LIST_TTL, }, "probe_cache": dict(_disk_probe_cache), "fail_counts": dict(_disk_fail_counts), "backoff": { d: max(0, int(retry - now)) for d, retry in _disk_fail_backoff.items() if retry > now }, "max_workers": _MAX_WORKERS, } def _db_connect() -> sqlite3.Connection: conn = sqlite3.connect(_DB_PATH, timeout=5) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA synchronous=NORMAL") return conn def init_disk_temperature_db() -> bool: """Create the table + index. Idempotent — safe to call on every AppImage start.""" try: os.makedirs(_DB_DIR, exist_ok=True) conn = _db_connect() conn.execute( """ CREATE TABLE IF NOT EXISTS disk_temperature_history ( id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp INTEGER NOT NULL, disk_name TEXT NOT NULL, value REAL NOT NULL ) """ ) # Composite index — queries always filter by disk_name + timestamp. conn.execute( """ CREATE INDEX IF NOT EXISTS idx_disk_temp_disk_ts ON disk_temperature_history(disk_name, timestamp) """ ) conn.commit() conn.close() return True except Exception as e: print(f"[ProxMenux] Disk temperature DB init failed: {e}") return False # --------------------------------------------------------------------------- # Disk enumeration + temperature read # --------------------------------------------------------------------------- # Match the modal's filter: USB drives are excluded. The hardware tab # already hides them in the per-disk list and the user's cluster # storage doesn't run on USB-attached disks anyway. Including them # would clutter the history table for thumbdrives plugged in once # during a recovery session. def _is_usb_disk(disk_name: str) -> bool: """Return True for disks attached over USB. Mirrors the heuristic in `get_disk_connection_type` in flask_server — checks the realpath of /sys/block/ for `usb` in the bus chain.""" try: link = os.path.realpath(f"/sys/block/{disk_name}") return "/usb" in link except OSError: return False def _enumerate_target_disks() -> list[str]: """Run ``lsblk`` + USB filter. The expensive part is the realpath walks in ``_is_usb_disk``; both are short-lived but we still amortise them via the disk-list cache so they only run every few minutes.""" out: list[str] = [] try: proc = subprocess.run( ["lsblk", "-d", "-n", "-o", "NAME,TYPE"], capture_output=True, text=True, timeout=_LSBLK_TIMEOUT, ) if proc.returncode != 0: return out for line in proc.stdout.strip().splitlines(): parts = line.split() if len(parts) < 2: continue name, dtype = parts[0], parts[1] if dtype != "disk": continue # Skip virtual/loop devices that lsblk still reports as type=disk. if name.startswith("loop") or name.startswith("zd"): continue if _is_usb_disk(name): continue out.append(name) except (subprocess.TimeoutExpired, OSError): pass return out def _list_target_disks() -> list[str]: """Cached wrapper around ``_enumerate_target_disks``. Topology is re-read every ``_DISK_LIST_TTL`` seconds; in between we serve the list from memory.""" global _disk_list_cache now = time.time() with _cache_lock: if _disk_list_cache is not None and _disk_list_cache[0] > now: return list(_disk_list_cache[1]) fresh = _enumerate_target_disks() with _cache_lock: _disk_list_cache = (now + _DISK_LIST_TTL, list(fresh)) return fresh def _smartctl_cmd_for(disk_name: str, probe: str) -> list[str]: """Build the smartctl invocation for a given probe key.""" cmd = ["smartctl", "-A", "-j"] if probe != "auto": cmd.extend(["-d", probe]) cmd.append(f"/dev/{disk_name}") return cmd def _try_probe(disk_name: str, probe: str) -> Optional[float]: """Run a single smartctl invocation and parse the temperature.""" try: proc = subprocess.run( _smartctl_cmd_for(disk_name, probe), capture_output=True, text=True, timeout=_SMARTCTL_TIMEOUT, ) # smartctl returns non-zero on warnings (bit 0x40 etc.) even when # JSON is fully populated. Don't gate on returncode — parse the # body regardless. if not proc.stdout: return None data = json.loads(proc.stdout) return _extract_temperature(data) except (subprocess.TimeoutExpired, OSError, json.JSONDecodeError): return None def _read_temperature(disk_name: str) -> Optional[float]: """Pull the current temperature from ``smartctl -A -j``. Caching strategy: * If we've previously found a working probe for this disk we go straight to it — no fallback chain. * If the probe-cache entry stops working (kernel upgrade swapped the auto-detect path, etc.) we fall through to the full chain and update the cache with whatever does work. * Disks that never report a temperature get rate-limited via the backoff table so we don't smartctl them every minute forever. """ now = time.time() # Backoff: skip drives that recently failed too many times. with _cache_lock: retry_at = _disk_fail_backoff.get(disk_name, 0) cached_probe = _disk_probe_cache.get(disk_name) if retry_at > now: return None # Fast path: cached probe. if cached_probe is not None: temp = _try_probe(disk_name, cached_probe) if temp is not None and temp > 0: with _cache_lock: _disk_fail_counts.pop(disk_name, None) _disk_fail_backoff.pop(disk_name, None) return temp # Cached probe stopped working — fall through and re-detect. # Slow path: try every probe and remember the first one that works. for probe in ("auto", "nvme", "ata", "sat"): if probe == cached_probe: continue # already tried above temp = _try_probe(disk_name, probe) if temp is not None and temp > 0: with _cache_lock: _disk_probe_cache[disk_name] = probe _disk_fail_counts.pop(disk_name, None) _disk_fail_backoff.pop(disk_name, None) return temp # All probes failed. Bump the failure counter and trip the backoff # if we've crossed the threshold. with _cache_lock: n = _disk_fail_counts.get(disk_name, 0) + 1 _disk_fail_counts[disk_name] = n if n >= _FAIL_THRESHOLD: _disk_fail_backoff[disk_name] = now + _FAIL_BACKOFF_SECONDS # Drop the stale probe cache so the next attempt re-detects. _disk_probe_cache.pop(disk_name, None) return None def _extract_temperature(data: dict[str, Any]) -> Optional[float]: """Pull the current temperature out of the smartctl JSON payload. smartctl exposes temperature in different places depending on disk class: - SATA/SAS: ``temperature.current`` - NVMe: ``nvme_smart_health_information_log.temperature`` (in K on some firmwares, °C on most modern ones — 250 is the sentinel for "value too high to be plausible degrees C", treat as Kelvin) - SAS legacy: ``ata_smart_attributes.table[id=190 or 194]`` """ # Modern path — works for almost every disk class. cur = data.get("temperature", {}).get("current") if isinstance(cur, (int, float)): return float(cur) # NVMe-specific path. nvme = data.get("nvme_smart_health_information_log", {}) if isinstance(nvme, dict): n_temp = nvme.get("temperature") if isinstance(n_temp, (int, float)): # Some NVMe firmwares report Kelvin (273.15+). Anything > 200 # has to be Kelvin since no SSD survives 200 °C. return float(n_temp - 273) if n_temp > 200 else float(n_temp) # Legacy ATA SMART attribute table fallback. ata = data.get("ata_smart_attributes", {}) if isinstance(ata, dict): for row in ata.get("table", []) or []: try: attr_id = row.get("id") if attr_id in (190, 194): raw = row.get("raw", {}).get("value") if isinstance(raw, (int, float)) and 0 < raw < 200: return float(raw) except (AttributeError, TypeError): continue return None # --------------------------------------------------------------------------- # Public API — sampler + history query # --------------------------------------------------------------------------- def record_all_disk_temperatures() -> int: """Sample every non-USB disk and persist its temperature. Sampling fans out across a thread pool so a host with N disks pays roughly the time of the slowest single ``smartctl`` call instead of N × that. ``smartctl`` is mostly waiting on a kernel IOCTL, so threading is enough — no need for asyncio. Returns the number of rows actually written. """ disks = _list_target_disks() if not disks: return 0 now = int(time.time()) workers = min(len(disks), _MAX_WORKERS) rows: list[tuple[int, str, float]] = [] try: with ThreadPoolExecutor(max_workers=workers, thread_name_prefix="disktemp") as pool: for disk_name, temp in zip(disks, pool.map(_read_temperature, disks)): if temp is None or temp <= 0: continue rows.append((now, disk_name, round(temp, 1))) except Exception as e: # If the pool itself blows up, log and bail — better to skip a # sample than to crash the collector loop. print(f"[ProxMenux] Disk temperature pool failed: {e}") return 0 if not rows: return 0 try: conn = _db_connect() conn.executemany( "INSERT INTO disk_temperature_history (timestamp, disk_name, value) VALUES (?, ?, ?)", rows, ) conn.commit() conn.close() return len(rows) except Exception as e: print(f"[ProxMenux] Disk temperature record failed: {e}") return 0 def cleanup_old_disk_temperature_data() -> None: """Drop rows older than the retention window. Cheap — runs in milliseconds against the indexed timestamp column.""" try: cutoff = int(time.time()) - (_RETENTION_DAYS * 86400) conn = _db_connect() conn.execute( "DELETE FROM disk_temperature_history WHERE timestamp < ?", (cutoff,), ) conn.commit() conn.close() except Exception: pass # Whitelist regex for disk names to make sure a malicious URL parameter # can never trip the SQL or land arbitrary text in WHERE clauses. The # module is otherwise parameterised, so this is belt-and-braces. _DISK_NAME_RE = re.compile(r"^[a-zA-Z0-9_-]+$") def get_disk_temperature_history(disk_name: str, timeframe: str = "hour") -> dict[str, Any]: """Return per-disk history with the same shape and downsampling as the CPU temperature endpoint. Timeframes: - hour: last 1 h, raw points (~60) - day: last 24 h, 5-minute averages (288 points) - week: last 7 days, 30-minute averages (336 points) - month: last 30 days, 2-hour averages (360 points) """ empty = {"data": [], "stats": {"min": 0, "max": 0, "avg": 0, "current": 0}} if not _DISK_NAME_RE.match(disk_name or ""): return empty now = int(time.time()) if timeframe == "day": since, interval = now - 86400, 300 elif timeframe == "week": since, interval = now - 7 * 86400, 1800 elif timeframe == "month": since, interval = now - 30 * 86400, 7200 else: # hour or unknown since, interval = now - 3600, None try: conn = _db_connect() if interval is None: cursor = conn.execute( """ SELECT timestamp, value FROM disk_temperature_history WHERE disk_name = ? AND timestamp >= ? ORDER BY timestamp ASC """, (disk_name, since), ) rows = cursor.fetchall() data = [{"timestamp": r[0], "value": r[1]} for r in rows] else: cursor = conn.execute( """ SELECT (timestamp / ?) * ? as bucket, ROUND(AVG(value), 1) as avg_val, ROUND(MIN(value), 1) as min_val, ROUND(MAX(value), 1) as max_val FROM disk_temperature_history WHERE disk_name = ? AND timestamp >= ? GROUP BY bucket ORDER BY bucket ASC """, (interval, interval, disk_name, since), ) rows = cursor.fetchall() data = [ {"timestamp": r[0], "value": r[1], "min": r[2], "max": r[3]} for r in rows ] conn.close() except Exception: return empty if not data: return empty values = [d["value"] for d in data] if interval is not None and "min" in data[0]: actual_min = min(d["min"] for d in data) actual_max = max(d["max"] for d in data) else: actual_min = min(values) actual_max = max(values) stats = { "min": round(actual_min, 1), "max": round(actual_max, 1), "avg": round(sum(values) / len(values), 1), "current": values[-1], } return {"data": data, "stats": stats}