Update AppImage 1.2.1.2

2026-05-22 16:44:48 +00:00 · 2026-05-21 21:17:59 +02:00
parent 3e9dd599a6
commit f5b7a0a74b
7 changed files with 367 additions and 38 deletions
@@ -152,13 +152,65 @@ def _get_jwt_secret():
    config = load_auth_config()
    sec = config.get("jwt_secret")
    if isinstance(sec, str) and len(sec) >= 32:
+        _audit_api_tokens_against_jwt_secret(sec)
        return sec
    new_secret = secrets.token_urlsafe(48)
    config["jwt_secret"] = new_secret
    save_auth_config(config)
+    _audit_api_tokens_against_jwt_secret(new_secret)
    return new_secret


+# One-shot startup audit: warn the operator (in journal) when stored
+# api_tokens were minted under a previous jwt_secret. Those tokens
+# remain in `api_tokens` metadata but their JWTs no longer verify, so
+# the user's HTTP client (Home Assistant, custom script, …) gets a 401
+# while the token "looks valid" in the UI. We log once per process to
+# make the failure mode searchable in journalctl without spamming.
+_TOKEN_AUDIT_DONE = False
+_TOKEN_AUDIT_LOCK = threading.Lock()
+
+
+def _audit_api_tokens_against_jwt_secret(current_secret: str) -> None:
+    """Warn once per process about api_tokens minted under an old jwt_secret.
+
+    Compares each stored token's `signed_with` fingerprint with the
+    fingerprint of `current_secret`, then prints a WARNING for mismatches
+    and a NOTE for entries that carry no fingerprint at all. Output goes
+    through print().
+
+    The done-flag is set *before* the audit runs, so the check happens at
+    most once per process even when it fails: a broken config yields a
+    single "skipped" line instead of one per call.
+    """
+    global _TOKEN_AUDIT_DONE
+    with _TOKEN_AUDIT_LOCK:
+        if _TOKEN_AUDIT_DONE:
+            return
+        _TOKEN_AUDIT_DONE = True
+
+    def _ids(entries) -> str:
+        # `id` may be missing, null or non-string in a hand-edited config;
+        # str.join() would raise TypeError and the warning would be lost.
+        return ", ".join(str(t.get("id") or "?") for t in entries)
+
+    try:
+        stored = load_auth_config().get("api_tokens") or []
+        # Skip malformed (non-dict) entries instead of aborting the audit.
+        tokens = [t for t in stored if isinstance(t, dict)]
+        if not tokens:
+            return
+        current_fp = hashlib.sha256(current_secret.encode()).hexdigest()[:16]
+        legacy = [t for t in tokens if t.get("signed_with") is None]
+        stale = [t for t in tokens
+                 if t.get("signed_with") not in (None, current_fp)]
+        if stale:
+            print(f"[ProxMenux][auth] WARNING: {len(stale)} API token(s) "
+                  f"signed with a previous jwt_secret — they will return "
+                  f"401 'Invalid or expired token'. Revoke and regenerate "
+                  f"from Settings → API Tokens. Affected IDs: {_ids(stale)}")
+        if legacy:
+            print(f"[ProxMenux][auth] NOTE: {len(legacy)} API token(s) "
+                  f"have no signing-secret fingerprint (created before "
+                  f"the tracking field was added). Their validity can "
+                  f"only be confirmed by an actual auth attempt. "
+                  f"Legacy IDs: {_ids(legacy)}")
+    except Exception as e:
+        # Audit is best-effort — failure must never break startup.
+        print(f"[ProxMenux][auth] token audit skipped: {e}")
+
+
 # Server-side mirror of the frontend's `validatePasswordStrength`. Defense
 # in depth: the UI enforces these rules but a direct API caller (curl,
 # scripted setup, custom client) bypasses the JS — so the same minimum has
@@ -419,24 +471,45 @@ def verify_token(token):
        return None


+def _jwt_secret_fingerprint(secret: str = None) -> str:
+    """Return a short, non-reversible identifier for a jwt_secret.
+
+    When `secret` is None the active secret from `_get_jwt_secret()` is
+    used. The result is the first 16 hex characters of the secret's
+    SHA-256 digest, or "" when the secret is empty. Stored alongside an
+    api-token, it lets later code tell whether that token was minted
+    under the current jwt_secret without ever exposing the secret.
+    """
+    if secret is None:
+        secret = _get_jwt_secret()
+    digest = hashlib.sha256(secret.encode()).hexdigest() if secret else ""
+    return digest[:16]
+
+
 def store_api_token_metadata(token, token_name="API Token"):
    """
    Store API token metadata (hash, name, creation date) for listing and revocation.
    The actual token is never stored - only a hash for identification.
+
+    Also records the fingerprint of the jwt_secret that minted this token
+    (`signed_with`). At list time we compare this against the current
+    fingerprint so the UI can flag tokens whose signing secret has been
+    rotated since — those JWTs no longer verify and the operator needs
+    to regenerate them (see `list_api_tokens`).
    """
    config = load_auth_config()
    token_hash = hashlib.sha256(token.encode()).hexdigest()
    token_id = token_hash[:16]
-    
+
    token_entry = {
        "id": token_id,
        "name": token_name,
        "token_hash": token_hash,
        "token_prefix": token[:12] + "...",
        "created_at": datetime.utcnow().isoformat() + "Z",
-        "expires_at": (datetime.utcnow() + timedelta(days=365)).isoformat() + "Z"
+        "expires_at": (datetime.utcnow() + timedelta(days=365)).isoformat() + "Z",
+        "signed_with": _jwt_secret_fingerprint(),
    }
-    
+
    config.setdefault("api_tokens", [])
    config["api_tokens"].append(token_entry)
    save_auth_config(config)
@@ -444,24 +517,56 @@ def store_api_token_metadata(token, token_name="API Token"):


 def list_api_tokens():
-    """
-    List all stored API token metadata (no actual tokens are returned).
-    Returns list of token entries with id, name, prefix, creation and expiration dates.
+    """List stored API token metadata (no actual tokens are returned).
+
+    Returns a list of dicts with `id`, `name`, `token_prefix`,
+    `created_at` and `expires_at`, plus the following status fields.
+
+    Each entry carries:
+      * `revoked`  — token hash is in the revocation list.
+      * `valid`    — JWT can still be verified with the current secret.
+                     `True` when `signed_with` matches the current
+                     fingerprint, `False` when it doesn't (jwt_secret
+                     rotated → JWT signature broken), `None` for legacy
+                     entries created before this field existed (status
+                     can only be confirmed by attempting a verify with
+                     the real token, which we never see at list time).
+      * `invalidation_reason` — human-readable explanation when
+                                `valid is False`, otherwise absent.
+
+    Note that `valid` is derived from the `signed_with` fingerprint
+    alone; `expires_at` and `revoked` are reported separately and do
+    not influence it.
+
+    The UI uses these fields to highlight tokens that look stored but
+    no longer authenticate — preventing the "I have the token but it
+    returns 401" rabbit hole.
     """
     config = load_auth_config()
     tokens = config.get("api_tokens", [])
     revoked = set(config.get("revoked_tokens", []))
-    
+    current_fp = _jwt_secret_fingerprint()
+
     result = []
     for t in tokens:
+        signed_with = t.get("signed_with")
+        if signed_with is None:
+            valid = None  # legacy entry — unknown
+            reason = None
+        elif signed_with == current_fp:
+            valid = True
+            reason = None
+        else:
+            valid = False
+            reason = ("Signed with a previous jwt_secret. The signing "
+                      "secret has been rotated since this token was "
+                      "issued — its JWT can no longer be verified. "
+                      "Revoke this token and generate a new one.")
+
         entry = {
             "id": t.get("id"),
             "name": t.get("name", "API Token"),
             "token_prefix": t.get("token_prefix", "***"),
             "created_at": t.get("created_at"),
             "expires_at": t.get("expires_at"),
-            "revoked": t.get("token_hash") in revoked
+            "revoked": t.get("token_hash") in revoked,
+            "valid": valid,
         }
+        if reason:
+            entry["invalidation_reason"] = reason
         result.append(entry)
     return result

@@ -7,6 +7,31 @@ ProxMenux Flask Server
 - Integrates a web terminal powered by xterm.js
 """

+# ─── gevent monkey-patch — MUST be the first executable code ─────────────
+#
+# When SSL is enabled we serve the dashboard with `gevent.pywsgi.WSGIServer`.
+# Without `monkey.patch_all()` gevent runs as a single-threaded cooperative
+# event loop: a request that calls `subprocess.run(pvesh ...)` blocks the
+# whole event loop, so every other request lined up in parallel returns 502
+# until that subprocess finishes. The frontend's `/api/vms` page fires 3-4
+# parallel requests on mount, which is exactly the symptom that surfaced as
+# "first load 502, second load fine" under HTTPS.
+#
+# `patch_all()` replaces stdlib blocking primitives (socket, subprocess,
+# select, threading, ssl, time.sleep, ...) with gevent-friendly equivalents
+# that yield to the event loop instead of blocking it. This must run BEFORE
+# any other import touches those primitives — otherwise the unpatched
+# versions get bound in the module and the patch is silently ineffective.
+#
+# Wrapped in a try/except so a host without gevent installed (HTTP-only
+# mode) still imports cleanly: the patch is only meaningful when gevent is
+# actually being used as the WSGI server.
+try:
+    from gevent import monkey
+    monkey.patch_all()
+except ImportError:
+    pass
+
 import glob
 import json
 import logging
@@ -136,12 +136,30 @@ class NotificationEvent:
        return f"NotificationEvent({self.event_type}, {self.severity}, fp={self.fingerprint[:40]})"


+_HOSTNAME_CACHE: Dict[str, Any] = {'value': None, 'ts': 0.0}
+_HOSTNAME_CACHE_TTL = 5.0  # seconds
+
+
 def _hostname() -> str:
    """Get display hostname for notifications.
-    
+
    Returns the custom display name from notification settings if configured,
-    otherwise falls back to the system hostname.
+    otherwise falls back to the system FQDN (NOT truncated at the first dot —
+    a host called ``px.seeindustry.com`` is rendered in full so multi-host
+    deployments stay distinguishable).
+
+    Reads are cached for ~5 s so a burst of events (~tens per cycle) doesn't
+    hit the SQLite settings table on every call. The TTL is short enough that
+    a freshly-saved alias takes effect within seconds without restarting the
+    service — fixes the original behaviour where `self._hostname = _hostname()`
+    was cached in `__init__` and never refreshed.
    """
+    now = time.time()
+    cached = _HOSTNAME_CACHE.get('value')
+    if cached is not None and (now - _HOSTNAME_CACHE['ts']) < _HOSTNAME_CACHE_TTL:
+        return cached
+
+    resolved = ''
    # Try to read custom display name from notification settings
    try:
        db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
@@ -156,15 +174,24 @@ def _hostname() -> str:
            row = cursor.fetchone()
            conn.close()
            if row and row[0] and row[0].strip():
-                return row[0].strip()
+                resolved = row[0].strip()
    except Exception:
        pass  # Fall back to system hostname
-    
-    # Fall back to system hostname
-    try:
-        return socket.gethostname().split('.')[0]
-    except Exception:
-        return 'proxmox'
+
+    if not resolved:
+        # Use FULL FQDN — never truncate at the first dot. The previous
+        # `.split('.')[0]` produced misleading bare labels like "px" when the
+        # alias was missing or unreadable, with no way for the operator to
+        # tell which of their `px.*.example.com` nodes the notification came
+        # from. The Display Name (alias) remains the recommended override.
+        try:
+            resolved = socket.gethostname()
+        except Exception:
+            resolved = 'proxmox'
+
+    _HOSTNAME_CACHE['value'] = resolved
+    _HOSTNAME_CACHE['ts'] = now
+    return resolved


 def capture_journal_context(keywords: list, lines: int = 30,
@@ -376,7 +403,10 @@ class JournalWatcher:
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._process: Optional[subprocess.Popen] = None
-        self._hostname = _hostname()
+        # `_hostname` is exposed as a @property below so every read returns
+        # the *current* alias from the settings DB (TTL-cached for 5 s in
+        # _hostname()). The old `__init__`-time cache made a fresh Display
+        # Name require a service restart to take effect.
        
        # Dedup: track recent events to avoid duplicates
        self._recent_events: Dict[str, float] = {}
@@ -421,10 +451,14 @@ class JournalWatcher:
        # so we can suppress per-guest "Starting Backup of VM ..." noise
        self._last_backup_job_ts: float = 0
        self._BACKUP_JOB_SUPPRESS_WINDOW = 7200  # 2h: suppress per-guest during active job
-        
+
        # NOTE: Service failure batching is handled universally by
        # BurstAggregator in NotificationManager (AGGREGATION_RULES).
-    
+
+    @property
+    def _hostname(self) -> str:
+        """Display hostname, looked up on every read rather than stored on the instance."""
+        # The bare name below is the module-level _hostname() function:
+        # class attributes are not in scope inside a method body.
+        return _hostname()
+
    def start(self):
        """Start the journal watcher thread."""
        if self._running:
@@ -1752,7 +1786,10 @@ class TaskWatcher:
        self._queue = event_queue
        self._running = False
        self._thread: Optional[threading.Thread] = None
-        self._hostname = _hostname()
+        # `_hostname` is exposed as a @property below so every read returns
+        # the *current* alias from the settings DB (TTL-cached for 5 s in
+        # _hostname()). The old `__init__`-time cache made a fresh Display
+        # Name require a service restart to take effect.
        self._last_position = 0
        # Cache for active vzdump detection
        self._vzdump_active_cache: float = 0  # timestamp of last positive check
@@ -1765,12 +1802,16 @@ class TaskWatcher:
        self._vzdump_grace_period = 120  # seconds after vzdump ends to still suppress
        # Track active-file UPIDs we've already seen, to avoid duplicate backup_start
        self._seen_active_upids: set = set()
-    
+
+    @property
+    def _hostname(self) -> str:
+        """Display hostname, looked up on every read rather than stored on the instance."""
+        # The bare name below is the module-level _hostname() function:
+        # class attributes are not in scope inside a method body.
+        return _hostname()
+
    def start(self):
        if self._running:
            return
        self._running = True
-        
+
        # Start at end of file
        if os.path.exists(self.TASK_LOG):
            try:
@@ -2263,7 +2304,10 @@ class PollingCollector:
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._poll_interval = poll_interval
-        self._hostname = _hostname()
+        # `_hostname` is exposed as a @property below so every read returns
+        # the *current* alias from the settings DB (TTL-cached for 5 s in
+        # _hostname()). The old `__init__`-time cache made a fresh Display
+        # Name require a service restart to take effect.
        self._last_update_check = 0
        self._last_proxmenux_check = 0
        self._last_ai_model_check = 0
@@ -2312,7 +2356,11 @@ class PollingCollector:
        # subprocess per disk-with-error per poll cycle. Key: bare device
        # name (no /dev/). Value: bool (True = USB).
        self._is_usb_cache: Dict[str, bool] = {}
-    
+
+    @property
+    def _hostname(self) -> str:
+        """Display hostname, looked up on every read rather than stored on the instance."""
+        # The bare name below is the module-level _hostname() function:
+        # class attributes are not in scope inside a method body.
+        return _hostname()
+
    def start(self):
        if self._running:
            return
@@ -3703,8 +3751,15 @@ class ProxmoxHookWatcher:
    
    def __init__(self, event_queue: Queue):
        self._queue = event_queue
-        self._hostname = _hostname()
-    
+        # `_hostname` is exposed as a @property below so every read returns
+        # the *current* alias from the settings DB (TTL-cached for 5 s in
+        # _hostname()). The old `__init__`-time cache made a fresh Display
+        # Name require a service restart to take effect.
+
+    @property
+    def _hostname(self) -> str:
+        """Display hostname, looked up on every read rather than stored on the instance."""
+        # The bare name below is the module-level _hostname() function:
+        # class attributes are not in scope inside a method body.
+        return _hostname()
+
    def process_webhook(self, payload: dict) -> dict:
        """Process an incoming Proxmox webhook payload.