# Mirror of https://github.com/MacRimi/ProxMenux.git
# Synced 2026-05-13 20:45:01 +00:00 (452 lines, 18 KiB, Python)
"""User-configurable Health Monitor thresholds.
|
|
|
|
Until now every threshold the Health Monitor (and the notification stack
|
|
that hangs off it) compares against was a hardcoded constant in
|
|
``health_monitor.py`` and a few helper modules. Operators repeatedly
|
|
asked for the ability to tune them per host — for example, a small
|
|
homelab user is fine with the rootfs filling to 92 % before being
|
|
nagged, while a production node owner wants the alert at 80 %.
|
|
|
|
This module is the single source of truth for those thresholds. The
|
|
JSON file at ``/usr/local/share/proxmenux/health_thresholds.json``
|
|
holds only the *overrides* the user has made; anything missing falls
|
|
back to the recommended default below. That keeps forward compatibility
|
|
trivial: new thresholds added in a later version are absent from older
|
|
JSON files and just resolve to their recommended value.
|
|
|
|
Public surface:
|
|
|
|
DEFAULTS — nested dict of recommended values + per-field metadata
|
|
get(section, key) — read effective value (override or default)
|
|
load() — return the user-configured overrides (no defaults applied)
|
|
load_effective() — return a fully-merged config (defaults + overrides)
|
|
save(payload) — validate & persist a partial or full config
|
|
reset_section(s) — clear all overrides for one section
|
|
reset_all() — wipe every override
|
|
invalidate_cache()— force the next ``get`` to re-read from disk
|
|
|
|
Every public function is safe to call from request handlers and from
|
|
the background health collector concurrently. A 5-second in-memory
|
|
cache avoids disk reads on the hot path; the cache is invalidated on
|
|
save/reset.
|
|
"""
|
|
|
|
from __future__ import annotations

import json
import math
import os
import threading
import time
from typing import Any, Optional
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Recommended defaults + metadata
|
|
#
|
|
# Each leaf entry is a dict with at least ``value``. The other keys
|
|
# describe validation and UI hints so the frontend can render the
|
|
# right input type without round-tripping schema info separately.
|
|
#
|
|
# Sections are designed to match the UI subsections one-to-one:
|
|
# cpu — CPU usage %
|
|
# memory — RAM and swap %
|
|
# host_storage — host filesystems (rootfs, /var/lib/vz, /mnt/*)
|
|
# lxc_rootfs — per-CT root disk %
|
|
# cpu_temperature — CPU °C
|
|
# disk_temperature — per-disk-class °C (hdd / ssd / nvme / sas)
|
|
#
|
|
# Phase 3 (this version) added: lxc_mount, pve_storage, zfs_pool — see below.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
DEFAULTS: dict[str, Any] = {
|
|
"cpu": {
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"memory": {
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"swap_critical": {"value": 5, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"host_storage": {
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"lxc_rootfs": {
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"cpu_temperature": {
|
|
"warning": {"value": 80, "unit": "°C", "min": 30, "max": 120, "step": 1},
|
|
"critical": {"value": 90, "unit": "°C", "min": 30, "max": 120, "step": 1},
|
|
},
|
|
"disk_temperature": {
|
|
"hdd": {
|
|
"warning": {"value": 60, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
"critical": {"value": 65, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
},
|
|
"ssd": {
|
|
"warning": {"value": 70, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
"critical": {"value": 75, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
},
|
|
"nvme": {
|
|
"warning": {"value": 80, "unit": "°C", "min": 30, "max": 110, "step": 1},
|
|
"critical": {"value": 85, "unit": "°C", "min": 30, "max": 110, "step": 1},
|
|
},
|
|
"sas": {
|
|
"warning": {"value": 55, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
"critical": {"value": 65, "unit": "°C", "min": 30, "max": 100, "step": 1},
|
|
},
|
|
},
|
|
# ── Phase 3: capacity checks added in this sprint ──────────────────
|
|
# These three sections drive new health checks that didn't exist
|
|
# before. Defaults match the host-storage thresholds so users who
|
|
# never customise see consistent alerting across all storage layers.
|
|
"lxc_mount": {
|
|
# Capacity of mountpoints inside running LXCs (mp0, mp1, NFS,
|
|
# bind mounts, etc.). Excludes pseudo-filesystems and the CT
|
|
# rootfs (already covered by `lxc_rootfs`).
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"pve_storage": {
|
|
# Capacity of PVE-registered storages that are not surfaced as
|
|
# a host filesystem (LVM/LVM-thin/RBD/ZFS-pool/PBS). Filesystem
|
|
# storages (dir/nfs/cifs) are already covered by `host_storage`
|
|
# via the underlying mount.
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
"zfs_pool": {
|
|
# ZFS pool fill level via `zpool list -H -p -o capacity`. Runs
|
|
# independently of PVE so pools that aren't registered as PVE
|
|
# storage (e.g. rpool, dedicated backup pools) still get
|
|
# monitored.
|
|
"warning": {"value": 85, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
"critical": {"value": 95, "unit": "%", "min": 1, "max": 100, "step": 1},
|
|
},
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Storage & cache
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_DB_DIR = "/usr/local/share/proxmenux"
|
|
_CONFIG_PATH = os.path.join(_DB_DIR, "health_thresholds.json")
|
|
|
|
_CACHE_TTL = 5 # seconds — cheap enough to skip disk reads on every comparison
|
|
_lock = threading.Lock()
|
|
_cache: dict[str, Any] = {"data": None, "time": 0.0}
|
|
|
|
|
|
def _read_disk() -> dict:
|
|
"""Load the JSON override file. Returns {} on first run / missing /
|
|
parse error so callers always see a valid dict."""
|
|
try:
|
|
with open(_CONFIG_PATH, "r", encoding="utf-8") as f:
|
|
data = json.load(f)
|
|
return data if isinstance(data, dict) else {}
|
|
except (FileNotFoundError, IsADirectoryError, PermissionError):
|
|
return {}
|
|
except (OSError, json.JSONDecodeError) as e:
|
|
print(f"[ProxMenux] health_thresholds: read failed ({e}); using defaults")
|
|
return {}
|
|
|
|
|
|
def _write_disk(data: dict) -> bool:
|
|
"""Persist the override dict atomically (write-and-rename so a
|
|
crash mid-write can't leave a half-written JSON behind)."""
|
|
try:
|
|
os.makedirs(_DB_DIR, exist_ok=True)
|
|
tmp = _CONFIG_PATH + ".tmp"
|
|
with open(tmp, "w", encoding="utf-8") as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
f.flush()
|
|
os.fsync(f.fileno())
|
|
os.replace(tmp, _CONFIG_PATH)
|
|
return True
|
|
except OSError as e:
|
|
print(f"[ProxMenux] health_thresholds: write failed: {e}")
|
|
return False
|
|
|
|
|
|
def invalidate_cache() -> None:
|
|
"""Force the next ``get`` to re-read from disk."""
|
|
with _lock:
|
|
_cache["data"] = None
|
|
_cache["time"] = 0.0
|
|
|
|
|
|
def _cached_overrides() -> dict:
|
|
"""Return the current overrides dict, hitting disk at most every
|
|
``_CACHE_TTL`` seconds. Lock ensures multiple threads don't race
|
|
to read the same file."""
|
|
now = time.time()
|
|
with _lock:
|
|
if _cache["data"] is None or now - _cache["time"] >= _CACHE_TTL:
|
|
_cache["data"] = _read_disk()
|
|
_cache["time"] = now
|
|
return _cache["data"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public read API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def get(section: str, *path: str, default: Optional[float] = None) -> Optional[float]:
|
|
"""Read an effective threshold value.
|
|
|
|
Examples::
|
|
|
|
get("cpu", "warning") -> 85 (or user override)
|
|
get("disk_temperature", "nvme", "warning") -> 80 (or override)
|
|
|
|
Order: user override (if present and valid) → recommended default →
|
|
the ``default`` argument. Returns a number, not the metadata dict.
|
|
"""
|
|
overrides = _cached_overrides()
|
|
|
|
# Walk the override tree
|
|
node: Any = overrides
|
|
for p in (section,) + path:
|
|
if not isinstance(node, dict):
|
|
node = None
|
|
break
|
|
node = node.get(p)
|
|
if isinstance(node, (int, float)):
|
|
return float(node)
|
|
|
|
# Fall back to recommended
|
|
node = DEFAULTS
|
|
for p in (section,) + path:
|
|
if not isinstance(node, dict):
|
|
return default
|
|
node = node.get(p)
|
|
if node is None:
|
|
return default
|
|
if isinstance(node, dict) and "value" in node:
|
|
return float(node["value"])
|
|
if isinstance(node, (int, float)):
|
|
return float(node)
|
|
return default
|
|
|
|
|
|
def load() -> dict:
|
|
"""Return the raw user overrides (no defaults merged in). Use this
|
|
for the GET endpoint when the frontend wants to know what's
|
|
customised vs untouched."""
|
|
return _cached_overrides()
|
|
|
|
|
|
def load_effective() -> dict:
|
|
"""Return a fully-merged tree (defaults + overrides), shaped like
|
|
DEFAULTS but with the leaf ``value`` replaced by the effective
|
|
threshold and an extra ``customised`` boolean per leaf."""
|
|
overrides = _cached_overrides()
|
|
|
|
def merge(default_node: Any, override_node: Any) -> Any:
|
|
if isinstance(default_node, dict) and "value" in default_node:
|
|
# Leaf
|
|
ov = override_node if isinstance(override_node, (int, float)) else None
|
|
return {
|
|
**default_node,
|
|
"value": float(ov) if ov is not None else default_node["value"],
|
|
"recommended": default_node["value"],
|
|
"customised": ov is not None,
|
|
}
|
|
if isinstance(default_node, dict):
|
|
ov_dict = override_node if isinstance(override_node, dict) else {}
|
|
return {k: merge(v, ov_dict.get(k)) for k, v in default_node.items()}
|
|
return default_node
|
|
|
|
return merge(DEFAULTS, overrides)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Validation + write API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class ThresholdValidationError(ValueError):
|
|
"""Raised when a save() payload violates the defaults' min/max range."""
|
|
|
|
|
|
def _validate(section: str, path: tuple[str, ...], value: Any) -> float:
|
|
"""Resolve metadata for the given leaf path, coerce ``value`` to
|
|
float, and check it against min/max. Raises ThresholdValidationError
|
|
on any problem."""
|
|
meta: Any = DEFAULTS
|
|
for p in (section,) + path:
|
|
if not isinstance(meta, dict) or p not in meta:
|
|
raise ThresholdValidationError(f"Unknown threshold: {section}.{'.'.join(path)}")
|
|
meta = meta[p]
|
|
if not isinstance(meta, dict) or "value" not in meta:
|
|
raise ThresholdValidationError(f"Path {section}.{'.'.join(path)} is not a leaf")
|
|
|
|
try:
|
|
v = float(value)
|
|
except (TypeError, ValueError):
|
|
raise ThresholdValidationError(
|
|
f"{section}.{'.'.join(path)} must be a number, got {value!r}"
|
|
)
|
|
|
|
if v != v or v in (float("inf"), float("-inf")):
|
|
raise ThresholdValidationError(f"{section}.{'.'.join(path)}: NaN/Inf not allowed")
|
|
|
|
lo = meta.get("min")
|
|
hi = meta.get("max")
|
|
if lo is not None and v < lo:
|
|
raise ThresholdValidationError(
|
|
f"{section}.{'.'.join(path)}: {v} < min {lo}"
|
|
)
|
|
if hi is not None and v > hi:
|
|
raise ThresholdValidationError(
|
|
f"{section}.{'.'.join(path)}: {v} > max {hi}"
|
|
)
|
|
return v
|
|
|
|
|
|
def _walk_and_validate(payload: dict, defaults_subtree: Any, path: tuple[str, ...]) -> dict:
|
|
"""Recursively walk ``payload`` mirroring ``defaults_subtree``'s
|
|
shape. Returns a clean dict with only valid leaves and validated
|
|
floats, or raises on the first problem."""
|
|
cleaned: dict[str, Any] = {}
|
|
if not isinstance(defaults_subtree, dict):
|
|
return cleaned
|
|
for key, value in payload.items():
|
|
if key not in defaults_subtree:
|
|
raise ThresholdValidationError(f"Unknown key: {'.'.join(path + (key,))}")
|
|
sub_default = defaults_subtree[key]
|
|
if isinstance(sub_default, dict) and "value" in sub_default:
|
|
# Leaf — validate value
|
|
cleaned[key] = _validate(path[0], path[1:] + (key,), value)
|
|
elif isinstance(sub_default, dict):
|
|
if not isinstance(value, dict):
|
|
raise ThresholdValidationError(
|
|
f"{'.'.join(path + (key,))} expected dict, got {type(value).__name__}"
|
|
)
|
|
sub = _walk_and_validate(value, sub_default, path + (key,))
|
|
if sub:
|
|
cleaned[key] = sub
|
|
return cleaned
|
|
|
|
|
|
def save(payload: dict) -> dict:
|
|
"""Validate and persist a partial or full payload. Only the keys
|
|
present in ``payload`` are touched — existing overrides for other
|
|
sections survive. Returns the new effective tree (same shape as
|
|
``load_effective``).
|
|
|
|
Raises ThresholdValidationError on any invalid value; nothing is
|
|
persisted in that case.
|
|
|
|
Sanity rules beyond min/max are enforced here too:
|
|
- critical >= warning for every section that has both
|
|
"""
|
|
if not isinstance(payload, dict):
|
|
raise ThresholdValidationError("payload must be an object")
|
|
|
|
# Walk and produce a cleaned, fully-validated subset
|
|
new_overrides: dict[str, Any] = {}
|
|
for section_key, section_payload in payload.items():
|
|
if section_key not in DEFAULTS:
|
|
raise ThresholdValidationError(f"Unknown section: {section_key}")
|
|
if not isinstance(section_payload, dict):
|
|
raise ThresholdValidationError(f"Section {section_key} must be an object")
|
|
cleaned = _walk_and_validate(section_payload, DEFAULTS[section_key], (section_key,))
|
|
if cleaned:
|
|
new_overrides[section_key] = cleaned
|
|
|
|
# Cross-field check: critical must not be lower than warning.
|
|
# Computed against the *effective* tree (existing overrides + this
|
|
# payload + defaults) so a partial save like "only warning=70" is
|
|
# checked against the existing critical value.
|
|
existing = _cached_overrides()
|
|
merged = _merge_overrides(existing, new_overrides)
|
|
_check_warn_le_crit(merged)
|
|
|
|
# Merge into the on-disk overrides (preserve sections not touched
|
|
# by this payload). Empty values inside cleaned mean "remove that
|
|
# leaf" — handled by _merge_overrides.
|
|
final = _merge_overrides(existing, new_overrides)
|
|
|
|
if not _write_disk(final):
|
|
raise ThresholdValidationError("Failed to persist thresholds to disk")
|
|
|
|
invalidate_cache()
|
|
return load_effective()
|
|
|
|
|
|
def _merge_overrides(existing: dict, incoming: dict) -> dict:
|
|
"""Deep-merge ``incoming`` into ``existing``. Keys in ``incoming``
|
|
overwrite; keys absent from ``incoming`` are preserved from
|
|
``existing``."""
|
|
out: dict[str, Any] = {k: v for k, v in existing.items() if isinstance(v, dict)}
|
|
# Also copy non-dict roots verbatim (shouldn't exist, but be tolerant)
|
|
for k, v in existing.items():
|
|
if k not in out:
|
|
out[k] = v
|
|
for k, v in incoming.items():
|
|
if isinstance(v, dict) and isinstance(out.get(k), dict):
|
|
out[k] = _merge_overrides(out[k], v)
|
|
else:
|
|
out[k] = v
|
|
return out
|
|
|
|
|
|
def _check_warn_le_crit(merged: dict) -> None:
|
|
"""Enforce critical >= warning for every section/sub-section that
|
|
exposes both. ``merged`` is a flat overrides tree — we walk both
|
|
it and DEFAULTS to resolve the effective values."""
|
|
|
|
def effective(node_default: Any, node_over: Any, key: str) -> Optional[float]:
|
|
if isinstance(node_over, dict) and isinstance(node_over.get(key), (int, float)):
|
|
return float(node_over[key])
|
|
leaf = node_default.get(key) if isinstance(node_default, dict) else None
|
|
if isinstance(leaf, dict) and "value" in leaf:
|
|
return float(leaf["value"])
|
|
return None
|
|
|
|
def walk(default_subtree: Any, override_subtree: Any, path_str: str) -> None:
|
|
if not isinstance(default_subtree, dict):
|
|
return
|
|
# If this dict has both "warning" and "critical" leaves, check.
|
|
if "warning" in default_subtree and "critical" in default_subtree and \
|
|
isinstance(default_subtree["warning"], dict) and "value" in default_subtree["warning"]:
|
|
warn = effective(default_subtree, override_subtree, "warning")
|
|
crit = effective(default_subtree, override_subtree, "critical")
|
|
if warn is not None and crit is not None and crit < warn:
|
|
raise ThresholdValidationError(
|
|
f"{path_str}: critical ({crit}) must be >= warning ({warn})"
|
|
)
|
|
# Recurse into nested groups (disk_temperature.hdd etc.)
|
|
for k, v in default_subtree.items():
|
|
if isinstance(v, dict) and "value" not in v:
|
|
ov = override_subtree.get(k) if isinstance(override_subtree, dict) else None
|
|
walk(v, ov, f"{path_str}.{k}" if path_str else k)
|
|
|
|
for section, section_default in DEFAULTS.items():
|
|
ov = merged.get(section, {})
|
|
walk(section_default, ov, section)
|
|
|
|
|
|
def reset_section(section: str) -> dict:
|
|
"""Drop every override under ``section`` (so it falls back to
|
|
recommended). Returns the new effective tree."""
|
|
if section not in DEFAULTS:
|
|
raise ThresholdValidationError(f"Unknown section: {section}")
|
|
existing = _cached_overrides()
|
|
if section in existing:
|
|
existing = {k: v for k, v in existing.items() if k != section}
|
|
if not _write_disk(existing):
|
|
raise ThresholdValidationError("Failed to persist thresholds to disk")
|
|
invalidate_cache()
|
|
return load_effective()
|
|
|
|
|
|
def reset_all() -> dict:
|
|
"""Wipe every override; everything falls back to recommended."""
|
|
if not _write_disk({}):
|
|
raise ThresholdValidationError("Failed to persist thresholds to disk")
|
|
invalidate_cache()
|
|
return load_effective()
|