# mirror of https://github.com/MacRimi/ProxMenux.git
# synced 2026-05-14 13:05:02 +00:00
# (578 lines, 22 KiB, Python)
"""ProxMenux-managed installs registry.
|
|
|
|
Single source of truth for "things ProxMenux installed (or detected as
|
|
already installed) and can check for updates on". Replaces the
|
|
type-specific polling we had before — every check now flows through
|
|
this module, so adding a new tracked install (Coral driver, Frigate,
|
|
etc.) is one entry in DETECTORS + one entry in CHECKERS.
|
|
|
|
Two operation modes:
|
|
|
|
* **Detection** — at AppImage startup and every 24h, every registered
|
|
``DETECTOR`` runs against the host. If the probe finds the thing
|
|
installed and it's not in the registry, we add it (with
|
|
``installed_by="detected"`` so the operator sees we autodiscovered
|
|
it). If it's in the registry but the probe fails, we mark it
|
|
``removed_at`` instead of deleting — keeps history and avoids
|
|
spurious notifications when a probe transiently fails.
|
|
|
|
* **Update check** — for every active entry, the matching ``CHECKER``
|
|
runs and updates ``current_version`` + ``available`` + ``latest``.
|
|
Each checker is responsible for its own per-source cache (the
|
|
Tailscale OCI checker memoises for 24h, NVIDIA for 7 days). The
|
|
notification poll loop reads the registry, emits a notification when
|
|
``available`` flips false→true for a (type, latest) pair it hasn't
|
|
notified yet.
|
|
|
|
Persistence is a single JSON file at
|
|
``/usr/local/share/proxmenux/managed_installs.json``. Atomic writes
|
|
via tmp+rename so a crash mid-write can't leave a half-written file.
|
|
|
|
The module is concurrency-safe: a single ``threading.RLock`` guards
|
|
every read-modify-write so the periodic detector and a request handler
|
|
calling ``get_registry()`` can run in parallel without stepping on
|
|
each other.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import urllib.request
|
|
from typing import Any, Callable, Optional
|
|
|
|
# ─── Storage ──────────────────────────────────────────────────────────────────

# Registry lives alongside the other ProxMenux state files on the host.
_DB_DIR = "/usr/local/share/proxmenux"
_REGISTRY_PATH = os.path.join(_DB_DIR, "managed_installs.json")
# Stamped into every persisted registry (see detect_and_register);
# bump when the on-disk item shape changes.
_SCHEMA_VERSION = 1

# Guards every read-modify-write of the registry file so the periodic
# detector and request handlers don't interleave.
_lock = threading.RLock()
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.utcnow().isoformat() + "Z"
|
|
|
|
|
|
def _read_registry() -> dict:
    """Load the registry JSON from disk.

    Always yields a usable dict: on first run, a parse failure, or a
    permissions problem the canonical empty shape is returned instead of
    raising, so callers never have to defend against a broken file.
    """
    loaded = None
    try:
        with open(_REGISTRY_PATH, "r", encoding="utf-8") as fh:
            loaded = json.load(fh)
    except (FileNotFoundError, IsADirectoryError, PermissionError):
        # Expected on first run / misconfigured perms — stay quiet.
        pass
    except (OSError, json.JSONDecodeError) as e:
        print(f"[ProxMenux] managed_installs read failed ({e}); starting fresh")
    if isinstance(loaded, dict) and isinstance(loaded.get("items"), list):
        return loaded
    return {"version": _SCHEMA_VERSION, "items": []}
|
|
|
|
|
|
def _write_registry(reg: dict) -> bool:
    """Persist ``reg`` atomically via tmp file + rename.

    Never raises: any OS-level failure is logged and reported as False so
    the caller can decide whether to retry. The fsync-before-rename pair
    guarantees a crash mid-write can't leave a truncated registry.
    """
    tmp_path = _REGISTRY_PATH + ".tmp"
    try:
        os.makedirs(_DB_DIR, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as fh:
            json.dump(reg, fh, indent=2, ensure_ascii=False)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_path, _REGISTRY_PATH)
    except OSError as e:
        print(f"[ProxMenux] managed_installs write failed: {e}")
        return False
    return True
|
|
|
|
|
|
# ─── Public read API ─────────────────────────────────────────────────────────
|
|
|
|
def get_registry() -> dict:
    """Return the full registry as a dict.

    Pure read — inspect ``items`` freely, but treat the result as
    read-only (don't mutate it).
    """
    with _lock:
        snapshot = _read_registry()
    return snapshot
|
|
|
|
|
|
def get_active_items() -> list[dict]:
    """Return the items the host has installed right now.

    Filters out anything with ``removed_at`` set — most callers want
    this view, not the full history.
    """
    with _lock:
        snapshot = _read_registry()
    active = []
    for item in snapshot.get("items", []):
        if not item.get("removed_at"):
            active.append(item)
    return active
|
|
|
|
|
|
def get_item(item_id: str) -> Optional[dict]:
    """Look up a single registry entry by its id; None when absent."""
    with _lock:
        candidates = _read_registry().get("items", [])
    return next(
        (entry for entry in candidates if entry.get("id") == item_id),
        None,
    )
|
|
|
|
|
|
# ─── DETECTORS — auto-discovery ──────────────────────────────────────────────
#
# Each detector is a `() -> Optional[dict]` that returns the *partial*
# entry shape (id, type, name, current_version, menu_label,
# menu_script — optional fields too) if the thing is installed on the
# host, or None if it's not. The framework merges this with the
# existing registry entry (preserving history) and rewrites if
# anything changed.
|
|
|
|
|
|
def _detect_nvidia_xfree86() -> Optional[dict]:
|
|
"""Detect a host-side NVIDIA driver via `nvidia-smi`."""
|
|
try:
|
|
proc = subprocess.run(
|
|
[
|
|
"nvidia-smi",
|
|
"--query-gpu=driver_version",
|
|
"--format=csv,noheader",
|
|
],
|
|
capture_output=True, text=True, timeout=5,
|
|
)
|
|
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
|
return None
|
|
if proc.returncode != 0:
|
|
return None
|
|
version = (proc.stdout or "").strip().splitlines()[0].strip() if proc.stdout else ""
|
|
if not re.match(r"^\d+\.\d+(\.\d+)?$", version):
|
|
return None
|
|
return {
|
|
"id": "nvidia-host",
|
|
"type": "nvidia_xfree86",
|
|
"name": "NVIDIA Host Driver",
|
|
"current_version": version,
|
|
"menu_label": "GPU & TPU → NVIDIA Driver",
|
|
"menu_script": "scripts/gpu_tpu/nvidia_installer.sh",
|
|
}
|
|
|
|
|
|
def _detect_oci_apps() -> list[dict]:
|
|
"""Bridge to the OCI manager so every OCI-installed app shows up
|
|
in the registry without a per-app detector here. The OCI manager
|
|
is the source of truth for OCI-specific state — we just project a
|
|
subset into our registry shape."""
|
|
try:
|
|
import oci_manager
|
|
except Exception:
|
|
return []
|
|
try:
|
|
installed = oci_manager.list_installed_apps() or []
|
|
except Exception as e:
|
|
print(f"[ProxMenux] managed_installs OCI bridge failed: {e}")
|
|
return []
|
|
out: list[dict] = []
|
|
for app in installed:
|
|
app_id = app.get("app_id") or app.get("id")
|
|
if not app_id:
|
|
continue
|
|
out.append({
|
|
"id": f"oci:{app_id}",
|
|
"type": "oci_app",
|
|
"name": app.get("name") or app_id,
|
|
"current_version": None, # filled by checker
|
|
"menu_label": "Settings → Secure Gateway",
|
|
"menu_script": None, # OCI apps update via the dashboard, no bash script
|
|
# Stash the raw app_id so the checker can find it without
|
|
# parsing the prefixed registry id.
|
|
"_oci_app_id": app_id,
|
|
})
|
|
return out
|
|
|
|
|
|
# Detectors registered here. Each returns either a single entry dict
# or a list (for sources that yield multiple items, like OCI). The
# framework normalises both shapes.
_DETECTORS: list[Callable[[], Any]] = [
    _detect_nvidia_xfree86,  # host NVIDIA driver probed via nvidia-smi
    _detect_oci_apps,        # one entry per app known to the OCI manager
]
|
|
|
|
|
|
def _normalise_detector_result(result: Any) -> list[dict]:
|
|
if not result:
|
|
return []
|
|
if isinstance(result, dict):
|
|
return [result]
|
|
if isinstance(result, list):
|
|
return [r for r in result if isinstance(r, dict)]
|
|
return []
|
|
|
|
|
|
def detect_and_register() -> dict:
    """Run every detector, merge results into the registry, persist.

    Behaviour per item:
    * detected + not in registry → add, ``installed_by="detected"``
    * detected + in registry as removed → reactivate (clear removed_at)
    * detected + already active → refresh ``current_version`` and any
      metadata that changed (e.g. menu_label evolved)
    * not detected + active in registry → mark ``removed_at``

    Returns the new registry.
    """
    # Phase 1 (no lock held): run the probes and collect results keyed
    # by entry id. Later detectors win on id collisions.
    discovered: dict[str, dict] = {}
    for detector in _DETECTORS:
        try:
            result = detector()
        except Exception as e:
            # NOTE(review): a detector that *raises* contributes nothing
            # to `discovered`, so its previously-active items get marked
            # removed_at in step 2 below — a transient crash therefore
            # looks like an uninstall. Confirm this is intended.
            print(f"[ProxMenux] managed_installs detector {detector.__name__} failed: {e}")
            continue
        for entry in _normalise_detector_result(result):
            if not entry.get("id"):
                continue
            discovered[entry["id"]] = entry

    # Phase 2 (lock held): merge into the persisted registry.
    with _lock:
        reg = _read_registry()
        items: list[dict] = list(reg.get("items", []))
        # id → position in `items`, for O(1) merge lookups.
        index = {it.get("id"): i for i, it in enumerate(items) if it.get("id")}

        # One timestamp for the whole merge so first_seen/last_seen/
        # removed_at written in this pass all agree.
        now = _now_iso()

        # 1. Add new + reactivate / refresh existing.
        for item_id, entry in discovered.items():
            if item_id in index:
                existing = items[index[item_id]]
                # Reactivate if it was previously removed
                if existing.get("removed_at"):
                    existing.pop("removed_at", None)
                    existing["reactivated_at"] = now
                # Refresh metadata fields that may have evolved
                for k in ("name", "current_version", "menu_label", "menu_script"):
                    if k in entry and entry[k] is not None:
                        existing[k] = entry[k]
                # Preserve internal helpers like `_oci_app_id`
                for k, v in entry.items():
                    if k.startswith("_"):
                        existing[k] = v
                existing["last_seen"] = now
            else:
                # Brand new entry
                new_entry = {
                    "id": entry["id"],
                    "type": entry.get("type", "unknown"),
                    "name": entry.get("name", entry["id"]),
                    "current_version": entry.get("current_version"),
                    "menu_label": entry.get("menu_label"),
                    "menu_script": entry.get("menu_script"),
                    "installed_by": "detected",
                    "first_seen": now,
                    "last_seen": now,
                    # Empty update state; filled by check_for_updates().
                    "update_check": {
                        "last_check": None,
                        "available": False,
                        "latest": None,
                        "error": None,
                    },
                }
                # Carry over internals (`_oci_app_id` etc.)
                for k, v in entry.items():
                    if k.startswith("_"):
                        new_entry[k] = v
                items.append(new_entry)

        # 2. Mark missing items as removed (don't delete — preserve
        # history so a reinstall doesn't lose the audit trail).
        for it in items:
            if not it.get("id") or it.get("removed_at"):
                continue
            if it["id"] not in discovered:
                it["removed_at"] = now

        reg["items"] = items
        reg["version"] = _SCHEMA_VERSION
        reg["last_detect"] = now
        _write_registry(reg)
        return reg
|
|
|
|
|
|
# ─── CHECKERS — per-type update probes ───────────────────────────────────────
#
# A checker takes a registry entry and returns the *update* part of
# the registry shape:
#     {available, latest, last_check, error?}
# It must be idempotent and may use its own internal cache so we don't
# pay the upstream cost on every call.
|
|
|
|
|
def _check_oci_app(entry: dict) -> dict:
    """Delegate the update probe to oci_manager (it keeps its own 24h cache)."""
    def _failure(reason: str) -> dict:
        # Uniform shape for every error exit.
        return {"available": False, "latest": None,
                "last_check": _now_iso(), "error": reason}

    app_id = entry.get("_oci_app_id") or entry.get("id", "").removeprefix("oci:")
    if not app_id:
        return _failure("no app_id in registry entry")
    try:
        import oci_manager
        state = oci_manager.check_app_update_available(app_id, force=False)
    except Exception as e:
        return _failure(str(e))
    if state.get("error"):
        return _failure(state["error"])
    return {
        "available": bool(state.get("available")),
        "latest": state.get("latest_version"),
        "current": state.get("current_version"),
        "last_check": state.get("last_checked_iso") or _now_iso(),
        "error": None,
        "_packages": state.get("packages") or [],
    }
|
|
|
|
|
|
# ── NVIDIA driver checker ──
#
# Source of truth for what's available upstream:
#   `https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt`
#     returns the single newest version, e.g. "580.105.08"
#   `https://download.nvidia.com/XFree86/Linux-x86_64/`
#     HTML directory listing — we scrape it for per-branch latest
#     (so a user on 570.x gets 570.x's latest, not pushed to 580.x
#     unless their kernel forces a branch upgrade).
#
# Cache TTL is 7 days because NVIDIA's release cadence on each branch
# is roughly monthly. The cache is in-memory only; AppImage restarts
# refresh it for free.
|
|
|
|
# Upstream archive root; per-version subdirectories are scraped from its
# directory listing by _fetch_nvidia_versions().
_NVIDIA_BASE = "https://download.nvidia.com/XFree86/Linux-x86_64"
# 7 days, in seconds.
_NVIDIA_CACHE_TTL = 7 * 86400
# In-memory memo for the parsed upstream version list:
# "versions" is newest-first; "fetched_at" is a time.time() stamp.
_nvidia_cache: dict[str, Any] = {"versions": [], "fetched_at": 0}
|
|
|
|
|
|
def _nvidia_kernel_compat() -> dict:
|
|
"""Python port of `get_kernel_compatibility_info` in the bash
|
|
installer. Returns ``{kernel, min_version, recommended_branch,
|
|
note}``. Kept identical to the bash matrix so the recommendation
|
|
here matches what the installer would do."""
|
|
try:
|
|
kernel = subprocess.run(
|
|
["uname", "-r"], capture_output=True, text=True, timeout=2,
|
|
).stdout.strip()
|
|
except (OSError, subprocess.TimeoutExpired):
|
|
kernel = ""
|
|
parts = kernel.split(".") if kernel else []
|
|
try:
|
|
major = int(parts[0]) if len(parts) >= 1 else 0
|
|
minor = int(parts[1]) if len(parts) >= 2 else 0
|
|
except (ValueError, TypeError):
|
|
major, minor = 0, 0
|
|
|
|
if major >= 7 or (major == 6 and minor >= 17):
|
|
return {
|
|
"kernel": kernel,
|
|
"min_version": "580.105.08",
|
|
"recommended_branch": "580",
|
|
"note": (f"Kernel {kernel} requires NVIDIA driver 580.105.08 or "
|
|
f"newer (older 580.x builds fail to compile)"),
|
|
}
|
|
if major >= 6 and minor >= 8:
|
|
return {"kernel": kernel, "min_version": "550",
|
|
"recommended_branch": "580",
|
|
"note": f"Kernel {kernel} works with NVIDIA driver 550.x or newer"}
|
|
if major >= 6:
|
|
return {"kernel": kernel, "min_version": "535",
|
|
"recommended_branch": "550",
|
|
"note": f"Kernel {kernel} works with NVIDIA driver 535.x or newer"}
|
|
if major == 5 and minor >= 15:
|
|
return {"kernel": kernel, "min_version": "470",
|
|
"recommended_branch": "535",
|
|
"note": f"Kernel {kernel} works with NVIDIA driver 470.x or newer"}
|
|
return {"kernel": kernel, "min_version": "450",
|
|
"recommended_branch": "470",
|
|
"note": "For older kernels, compatibility may vary"}
|
|
|
|
|
|
def _version_tuple(v: str) -> tuple:
|
|
"""Convert ``580.105.08`` → ``(580, 105, 8)`` for comparison.
|
|
Pads to 3 components so ``580.82`` < ``580.105.08``."""
|
|
out = []
|
|
for chunk in v.split("."):
|
|
try:
|
|
out.append(int(chunk))
|
|
except (ValueError, TypeError):
|
|
out.append(0)
|
|
while len(out) < 3:
|
|
out.append(0)
|
|
return tuple(out[:3])
|
|
|
|
|
|
def _fetch_nvidia_versions(force: bool = False) -> list[str]:
    """Return every upstream driver version, newest first.

    Serves from the in-memory memo while it's younger than
    ``_NVIDIA_CACHE_TTL`` (unless ``force``). On a fetch or parse
    failure the previously cached list — possibly empty — is returned
    so callers degrade gracefully.
    """
    now = time.time()
    cache_is_fresh = (now - _nvidia_cache["fetched_at"]) < _NVIDIA_CACHE_TTL
    if not force and _nvidia_cache["versions"] and cache_is_fresh:
        return _nvidia_cache["versions"]

    try:
        req = urllib.request.Request(
            _NVIDIA_BASE + "/",
            headers={"User-Agent": "ProxMenux-Monitor/1.0"},
        )
        with urllib.request.urlopen(req, timeout=15) as resp:
            html = resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f"[ProxMenux] NVIDIA version fetch failed: {e}")
        return _nvidia_cache.get("versions", [])

    # Scrape version-shaped hrefs ("580.105.08/", "535.86/") out of the
    # directory listing and order them newest-first.
    found = {m.group(1) for m in re.finditer(
        r"""href=['"](\d+\.\d+(?:\.\d+)?)/?['"]""", html)}
    versions = sorted(found, key=_version_tuple, reverse=True)
    if versions:
        _nvidia_cache["versions"] = versions
        _nvidia_cache["fetched_at"] = now
    return versions
|
|
|
|
|
|
def _is_compat_with_kernel(version: str, kernel_compat: dict) -> bool:
|
|
"""Compare ``version`` (e.g. ``580.105.08``) against the kernel
|
|
compatibility floor. Mirrors the bash ``is_version_compatible``
|
|
helper (full-triple compare when min is dotted, major-only otherwise)."""
|
|
min_str = kernel_compat.get("min_version", "0")
|
|
if "." in min_str and re.match(r"^\d+\.\d+\.\d+$", min_str):
|
|
return _version_tuple(version) >= _version_tuple(min_str)
|
|
# Single-major threshold like "535" or "550"
|
|
try:
|
|
ver_major = int(version.split(".")[0])
|
|
min_major = int(min_str)
|
|
except (ValueError, TypeError):
|
|
return True
|
|
return ver_major >= min_major
|
|
|
|
|
|
def _check_nvidia_xfree86(entry: dict) -> dict:
    """Compute the update state for a host NVIDIA driver entry.

    Policy (Option C from the design discussion):
    1. Same-branch newer version available → notify.
    2. Current branch no longer compatible with current kernel →
       notify a branch upgrade with explicit messaging.

    Branch-upgrade (case 2) takes priority over a same-branch patch.
    Returns the ``update_check`` shape plus the underscore-prefixed
    extras (``_upgrade_kind``, ``_kernel``, ``_kernel_note``) that
    check_for_updates() copies through for the UI.
    """
    current = entry.get("current_version")
    # Bail out unless the installed version looks like "x.y" or "x.y.z".
    if not current or not re.match(r"^\d+\.\d+(\.\d+)?$", current):
        return {"available": False, "latest": None,
                "last_check": _now_iso(), "error": "no installed version"}

    versions = _fetch_nvidia_versions()
    if not versions:
        return {"available": False, "latest": None,
                "last_check": _now_iso(),
                "error": "could not parse upstream version listing"}

    kernel_compat = _nvidia_kernel_compat()
    # Branch = the leading major component, e.g. "580" of "580.105.08".
    current_branch = current.split(".")[0]

    # `versions` is sorted newest-first, so index 0 of any filtered
    # slice is that slice's latest.
    same_branch = [v for v in versions if v.split(".")[0] == current_branch
                   and _is_compat_with_kernel(v, kernel_compat)]
    same_branch_latest = same_branch[0] if same_branch else None

    notify_branch_upgrade = False
    branch_upgrade_target: Optional[str] = None
    if not _is_compat_with_kernel(current, kernel_compat):
        # Current branch / version no longer works with current kernel.
        # Recommend the kernel-recommended branch's latest.
        rec_branch = kernel_compat["recommended_branch"]
        rec_branch_versions = [v for v in versions
                               if v.split(".")[0] == rec_branch
                               and _is_compat_with_kernel(v, kernel_compat)]
        if rec_branch_versions:
            branch_upgrade_target = rec_branch_versions[0]
            notify_branch_upgrade = True

    available = False
    latest: Optional[str] = None
    upgrade_kind = None  # "patch" | "branch_upgrade" | None

    if notify_branch_upgrade and branch_upgrade_target:
        latest = branch_upgrade_target
        available = True
        upgrade_kind = "branch_upgrade"
    elif same_branch_latest and \
            _version_tuple(same_branch_latest) > _version_tuple(current):
        latest = same_branch_latest
        available = True
        upgrade_kind = "patch"

    return {
        "available": available,
        "latest": latest,
        "last_check": _now_iso(),
        "error": None,
        "_upgrade_kind": upgrade_kind,
        "_kernel": kernel_compat.get("kernel"),
        "_kernel_note": kernel_compat.get("note"),
    }
|
|
|
|
|
|
# Type → checker dispatch table. Items whose `type` has no entry here
# are silently skipped by check_for_updates().
_CHECKERS: dict[str, Callable[[dict], dict]] = {
    "oci_app": _check_oci_app,
    "nvidia_xfree86": _check_nvidia_xfree86,
}
|
|
|
|
|
|
def check_for_updates(force: bool = False) -> list[dict]:
    """Run every type-specific checker over active items, persist
    the updated state, return the list of items that have an update
    available right now.

    The notification poller turns the returned list into events; the
    UI reads ``get_active_items()`` to render the inline "update
    available" line.

    ``force`` invalidates the per-source caches (currently only the
    NVIDIA versions list — OCI keeps its own internal cache).
    """
    if force:
        # Blow away the NVIDIA memo so _fetch_nvidia_versions refetches.
        _nvidia_cache["versions"] = []
        _nvidia_cache["fetched_at"] = 0

    updates_available: list[dict] = []
    with _lock:
        reg = _read_registry()
        items = reg.get("items", [])
        for it in items:
            # Skip uninstalled entries and types without a checker.
            if it.get("removed_at"):
                continue
            checker = _CHECKERS.get(it.get("type"))
            if not checker:
                continue
            try:
                result = checker(it)
            except Exception as e:
                # A crashing checker is recorded as an errored check,
                # never allowed to abort the whole run.
                print(f"[ProxMenux] managed_installs checker failed for "
                      f"{it.get('id')}: {e}")
                result = {"available": False, "latest": None,
                          "last_check": _now_iso(), "error": str(e)}

            # Normalise into the canonical update_check shape.
            it["update_check"] = {
                "available": bool(result.get("available")),
                "latest": result.get("latest"),
                "last_check": result.get("last_check") or _now_iso(),
                "error": result.get("error"),
            }
            # Backfill current_version when the checker learned it and
            # the detector hadn't (e.g. OCI apps start with None).
            if result.get("current") and not it.get("current_version"):
                it["current_version"] = result["current"]
            # Copy through checker-specific extras for the UI.
            for extra_key in ("_packages", "_upgrade_kind", "_kernel",
                              "_kernel_note"):
                if extra_key in result:
                    it["update_check"][extra_key] = result[extra_key]

            if it["update_check"]["available"]:
                updates_available.append(it)

        reg["items"] = items
        reg["last_check_run"] = _now_iso()
        _write_registry(reg)

    return updates_available
|