# ProxMenux/AppImage/scripts/mount_monitor.py
"""Sprint 13: detect remote mount issues that PVE storage monitoring misses.
Parses ``/proc/mounts`` filtering NFS/CIFS/SMB entries, then for each
one runs a timeout-bounded ``stat`` to catch stale handles. Stale NFS
is the typical failure mode that broke a user's LXC: the mount looks
present in ``/proc/mounts`` but any access either blocks indefinitely
or returns ``ESTALE``. Meanwhile any app in the LXC that keeps writing
to that path appends to the underlying directory on the local
filesystem (because the mount is effectively gone), which silently
fills up the LXC's root disk and eventually kills the container.
This module sits next to ``proxmox_storage_monitor.py`` (which only
covers PVE-registered storages) and complements it for arbitrary
remote mounts done outside PVE (e.g. ``/etc/fstab`` entries, ad-hoc
``mount -t cifs``, etc.).
Scope:
- Host mounts (Sprint 13). Mounts made inside running LXCs were
  initially out of scope, since reaching them via ``pct exec`` is slow
  and can hang on a corrupted guest; Sprint 13.24 (the LXC section
  below) added them by reading each CT's mount table straight from
  ``/proc/<pid>/mounts`` on the host.
- Detects stale mounts (timeout/ESTALE), unexpected read-only mounts,
  and plain reachable (``ok``) ones.
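
Typical read-side usage, assuming the module is importable as
``mount_monitor`` (illustrative sketch)::

    from mount_monitor import get_unhealthy_mounts

    for mount in get_unhealthy_mounts():
        print(mount['target'], mount['status'], mount['error'])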
"""
from __future__ import annotations
import os
import re
import subprocess
import threading
import time
from typing import Any
# `nfs`, `nfs4`, `cifs`, `smbfs`, `smb3`, etc. — any FS type whose name
# starts with one of the three remote families. Keeps the filter
# permissive without listing every variant.
_REMOTE_FS_RE = re.compile(r'^(nfs|cifs|smb)', re.IGNORECASE)
# Per-mount stat timeout. Configurable via env var so an admin running
# on a slow link can bump it without waiting for a code change. Default
# is 2 seconds — long enough that a healthy NFS over LAN responds, short
# enough that a stale mount doesn't block the health-check pipeline.
_STAT_TIMEOUT_SEC = int(os.environ.get('PROXMENUX_MOUNT_STAT_TIMEOUT', '2'))
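# (For example, launching the monitor with PROXMENUX_MOUNT_STAT_TIMEOUT=5
# in its environment raises the per-mount budget to 5 seconds.)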
# Top-level cache TTL: 60 s. Each scan is cheap (one stat, plus one df per
# reachable mount) but we don't want to redo it on every API hit either,
# especially when the dashboard polls every 5 s.
_CACHE_TTL_SEC = 60
_cache_lock = threading.Lock()
_cache: dict[str, Any] = {
'scanned_at': 0.0,
'mounts': [],
}
def _read_proc_mounts() -> list[dict[str, Any]]:
"""Parse /proc/mounts and return only NFS/CIFS/SMB entries.
Each entry: source, target, fstype, options (raw string), readonly.
Anything that fails to parse is skipped silently — this is a
monitor, not a validator, and a malformed line shouldn't crash the
health pipeline.
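
    A typical NFS line and the dict it becomes (illustrative values)::

        192.168.1.10:/export/media /mnt/media nfs4 rw,relatime,vers=4.2 0 0

        {'source': '192.168.1.10:/export/media', 'target': '/mnt/media',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False}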
"""
out: list[dict[str, Any]] = []
try:
with open('/proc/mounts', 'r', encoding='utf-8', errors='replace') as f:
for line in f:
parts = line.strip().split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
if not _REMOTE_FS_RE.match(fstype):
continue
opts_set = set(options.split(','))
out.append({
'source': source,
'target': target,
'fstype': fstype,
'options': options,
'readonly': 'ro' in opts_set,
})
except OSError:
pass
return out
def _check_reachable(target: str, timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Run ``stat`` against the mount target with a hard timeout.
Returns ``{reachable: bool, error: str | None}``. We use the
external ``stat`` binary rather than ``os.stat`` because the C
syscall blocks the GIL when an NFS mount is stale, and a hung
syscall would freeze the entire health monitor thread —
subprocess gives us a real timeout we can enforce.
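
    Illustrative results (example paths, not real hosts)::

        _check_reachable('/mnt/media')
        -> {'reachable': True, 'error': None}
        _check_reachable('/mnt/stale-share')
        -> {'reachable': False,
            'error': 'stat timed out after 2s (likely stale NFS handle)'}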
"""
try:
result = subprocess.run(
['stat', '-c', '%i', target],
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode == 0:
return {'reachable': True, 'error': None}
err = (result.stderr or result.stdout).strip() or 'stat returned non-zero'
return {'reachable': False, 'error': err}
except subprocess.TimeoutExpired:
return {
'reachable': False,
'error': f'stat timed out after {timeout}s (likely stale NFS handle)',
}
except OSError as e:
return {'reachable': False, 'error': str(e)}
def _disk_usage(target: str, timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Run ``df`` against the mount target with a hard timeout.
    Like ``_check_reachable``, we shell out so a stale NFS doesn't
    freeze the calling thread. Returns ``{total_bytes, used_bytes,
    available_bytes}`` when the call succeeds, with every field set to
    ``None`` when it times out or fails; the modal renders "n/a" then.
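
    Typical output being parsed (illustrative numbers)::

          1B-blocks        Used        Avail
        499596492800 23816245248 475780247552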
"""
empty = {'total_bytes': None, 'used_bytes': None, 'available_bytes': None}
try:
result = subprocess.run(
['df', '-B1', '--output=size,used,avail', target],
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode != 0:
return empty
# Output: header + 1 data line. Splitting on whitespace gives 3
# ints when df succeeds.
lines = [ln for ln in result.stdout.strip().splitlines() if ln.strip()]
if len(lines) < 2:
return empty
parts = lines[-1].split()
if len(parts) < 3:
return empty
try:
return {
'total_bytes': int(parts[0]),
'used_bytes': int(parts[1]),
'available_bytes': int(parts[2]),
}
except ValueError:
return empty
except (subprocess.TimeoutExpired, OSError):
return empty
def _is_proxmox_managed(target: str) -> bool:
"""True when the mount target lives under ``/mnt/pve/``.
    PVE's storage layer auto-mounts every NFS/CIFS storage at
    ``/mnt/pve/<storage_id>``, and by convention nothing else uses that
    prefix. So a target starting with it is reliably a PVE-managed mount,
    and the dashboard can flag it as such without paying a ``pvesh``
    round-trip per mount.
"""
return target.startswith('/mnt/pve/')
def scan_remote_mounts(force: bool = False) -> list[dict[str, Any]]:
"""Top-level scan: list each remote mount with its health status.
Cached for ``_CACHE_TTL_SEC`` so back-to-back API hits don't all
pay the stat cost. Pass ``force=True`` to bypass the cache (used
by the health monitor to make sure each poll round sees fresh
state).
Each entry adds:
- ``reachable``: bool
- ``error``: str | None
- ``status``: 'ok' | 'stale' | 'readonly'
``stale`` wins over ``readonly`` when both apply — a stale
mount is a higher-severity issue.
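
    Example entry (illustrative values)::

        {'source': '192.168.1.10:/export/media', 'target': '/mnt/pve/media',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False,
         'proxmox_managed': True, 'reachable': True, 'error': None,
         'total_bytes': 499596492800, 'used_bytes': 23816245248,
         'available_bytes': 475780247552, 'status': 'ok'}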
"""
now = time.time()
if not force:
with _cache_lock:
if now - _cache.get('scanned_at', 0) < _CACHE_TTL_SEC:
return list(_cache.get('mounts', []))
raw = _read_proc_mounts()
enriched: list[dict[str, Any]] = []
for m in raw:
health = _check_reachable(m['target'])
entry = dict(m)
entry['reachable'] = health['reachable']
entry['error'] = health['error']
entry['proxmox_managed'] = _is_proxmox_managed(m['target'])
# df only when the mount is reachable — running df on a stale
# mount blocks until the same timeout as stat, doubling the
# delay for nothing useful.
if health['reachable']:
entry.update(_disk_usage(m['target']))
else:
entry.update({'total_bytes': None, 'used_bytes': None, 'available_bytes': None})
if not health['reachable']:
entry['status'] = 'stale'
elif m['readonly']:
entry['status'] = 'readonly'
else:
entry['status'] = 'ok'
enriched.append(entry)
with _cache_lock:
_cache['scanned_at'] = now
_cache['mounts'] = enriched
return enriched
def get_unhealthy_mounts() -> list[dict[str, Any]]:
"""Convenience: only return mounts whose status is not ``ok``."""
return [m for m in scan_remote_mounts() if m.get('status') != 'ok']
# ---------------------------------------------------------------------------
# LXC mount scanning (Sprint 13.24)
# ---------------------------------------------------------------------------
#
# The case the user reported was an NFS mount **inside** an LXC going stale:
# the host doesn't see the mount in its own /proc/mounts, so the host scan
# above misses it entirely. The container, meanwhile, keeps writing to the
# stale path which silently fills its rootfs.
#
# We enumerate running LXCs by walking /proc for `lxc-start` processes,
# resolve each CT's init pid with `lxc-info -p` (bounded by a short
# timeout so a wedged CT can't hang the health monitor thread), and then
# read each CT's mount table directly from `/proc/<pid>/mounts`; no
# `pct exec` is needed.
#
# Stale detection also runs from the host, via `/proc/<pid>/root/<target>`
# rather than a `pct exec stat` per mount, which avoids spawning an extra
# exec per mount and is faster.
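#
# For illustration: a mount at /mnt/backups inside a CT whose init pid on
# the host is 4242 shows up to the host as /proc/4242/root/mnt/backups.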
# Per-CT timeout for container-level subprocess calls. 3 s comfortably
# covers a healthy CT; the env var lets an admin bump it on slow hosts.
_LXC_EXEC_TIMEOUT_SEC = int(os.environ.get('PROXMENUX_LXC_EXEC_TIMEOUT', '3'))
_lxc_cache_lock = threading.Lock()
_lxc_cache: dict[str, Any] = {
'scanned_at': 0.0,
'mounts': [],
}
def _has_any_running_lxc() -> bool:
"""Cheap "is at least one CT running?" probe.
Walks ``/proc`` looking for any process whose ``comm`` is
``lxc-start`` (the init shim that spawns CT pid 1). Bails on the
first match. Costs ~1-5ms even on hosts with thousands of
    processes. Used as a short-circuit before the more expensive per-CT
    ``lxc-info`` enumeration in `scan_lxc_mounts`.
"""
try:
for entry in os.scandir('/proc'):
if not entry.name.isdigit():
continue
try:
with open(f'/proc/{entry.name}/comm', 'r') as f:
if f.read().strip() == 'lxc-start':
return True
except (OSError, IOError):
continue
except OSError:
# If /proc is unreadable something is very wrong; let the
# caller proceed with the full scan rather than silently
# claiming no CTs run.
return True
return False
def _read_lxc_name(vmid: str) -> str:
"""Look up the CT hostname from /etc/pve/lxc/<vmid>.conf without
invoking ``pct``. Returns '' if the file is unreadable."""
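    # Illustrative config lines matched below:
    #   hostname: media-server          (PVE /etc/pve/lxc/<vmid>.conf)
    #   lxc.uts.name = media-server     (raw LXC config)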
for path in (f'/etc/pve/lxc/{vmid}.conf', f'/var/lib/lxc/{vmid}/config'):
try:
with open(path, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('hostname:'):
return line.split(':', 1)[1].strip()
if line.startswith('lxc.uts.name'):
# `lxc.uts.name = foo`
return line.split('=', 1)[1].strip()
except (OSError, IOError):
continue
return ''
def _list_running_lxcs() -> list[dict[str, str]]:
"""Return ``[{vmid, name, pid}]`` for every running LXC.
We need ``pid`` (the init process inside the CT, visible to the
host) so we can stat the mount target via ``/proc/<pid>/root/...``
without entering the container with another ``pct exec``.
Implementation walks ``/proc`` for ``lxc-start -F -n <vmid>``
processes — the userspace shim that supervises each running CT —
and resolves the CT init pid via ``lxc-info -p`` (~2 ms) instead
of the previous ``pct status --verbose`` chain (~500 ms per CT).
On a 7-CT host this collapses ~7 seconds of subprocess churn into
a single /proc walk plus seven 2 ms calls, dropping the full
``scan_lxc_mounts`` cost from ~8 s to <100 ms.
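
    Illustrative result (vmids, names and pids are examples)::

        [{'vmid': '101', 'name': 'media-server', 'pid': '4242'},
         {'vmid': '102', 'name': 'backup', 'pid': '4310'}]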
"""
out: list[dict[str, str]] = []
try:
proc_entries = list(os.scandir('/proc'))
except OSError:
return out
for entry in proc_entries:
if not entry.name.isdigit():
continue
try:
with open(f'/proc/{entry.name}/comm', 'r') as f:
if f.read().strip() != 'lxc-start':
continue
with open(f'/proc/{entry.name}/cmdline', 'rb') as f:
cmdline = f.read().split(b'\x00')
except (OSError, IOError):
continue
# cmdline like [b'/usr/bin/lxc-start', b'-F', b'-n', b'<vmid>', b'']
vmid = ''
try:
idx = cmdline.index(b'-n')
if idx + 1 < len(cmdline):
vmid = cmdline[idx + 1].decode('utf-8', errors='replace').strip()
except ValueError:
continue
if not vmid:
continue
pid = ''
try:
p2 = subprocess.run(
['lxc-info', '-n', vmid, '-p'],
capture_output=True, text=True, timeout=2,
)
if p2.returncode == 0:
for ln in p2.stdout.splitlines():
# lxc-info output: "PID: 12345"
if ln.strip().lower().startswith('pid:'):
pid = ln.split(':', 1)[1].strip()
break
except (subprocess.TimeoutExpired, OSError):
pass
out.append({'vmid': vmid, 'name': _read_lxc_name(vmid), 'pid': pid})
# Stable ordering by vmid for deterministic output.
out.sort(key=lambda c: int(c['vmid']) if c['vmid'].isdigit() else 0)
return out
def _read_lxc_mounts(ct: dict[str, str]) -> list[dict[str, Any]]:
"""Read remote FS mounts inside a running CT.
Uses ``/proc/<host_pid>/mounts`` (the kernel exposes every running
process's mount namespace there), so the host can read the CT's
full mount table directly with no ``pct exec`` subprocess. Returns
``[]`` on any failure rather than raising — a single bad CT
shouldn't break the scan of the rest.
Accepts a ``ct`` dict (from `_list_running_lxcs`) instead of a
bare vmid because we need the host PID, which is only available
after the lxc-info lookup.
"""
out: list[dict[str, Any]] = []
pid = ct.get('pid')
if not pid:
return out
try:
with open(f'/proc/{pid}/mounts', 'r') as f:
mount_lines = f.read().splitlines()
except (OSError, IOError):
return out
for line in mount_lines:
parts = line.split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
if not _REMOTE_FS_RE.match(fstype):
continue
out.append({
'source': source,
'target': target,
'fstype': fstype,
'options': options,
'readonly': 'ro' in set(options.split(',')),
})
return out
# Pseudo / virtual filesystems we never want to surface as a "mount
# nearing capacity" — these are kernel-managed and the numbers from
# statvfs are either nonsense (cgroup, sysfs) or change too fast to
# alert on (tmpfs).
_PSEUDO_FS = frozenset({
'proc', 'sysfs', 'devpts', 'devtmpfs', 'tmpfs', 'mqueue', 'pstore',
'cgroup', 'cgroup2', 'bpf', 'tracefs', 'debugfs', 'configfs',
'securityfs', 'fuse.lxcfs', 'fusectl', 'autofs', 'binfmt_misc',
'hugetlbfs', 'efivarfs', 'rpc_pipefs', 'nsfs', 'overlay',
})
def scan_lxc_mount_capacity(force: bool = False) -> list[dict[str, Any]]:
"""Capacity scan of mountpoints inside every running LXC.
Sibling of `scan_lxc_mounts` — same /proc-walk and lxc-info pattern
— but enumerates ALL real filesystems (not just NFS/CIFS/SMB) and
returns capacity numbers via ``os.statvfs`` on the host-side
namespace path ``/proc/<host_pid>/root/<target>``. Used by the
Phase 3 ``_check_lxc_mount_capacity`` health check.
Skips:
- Pseudo-filesystems (proc, sysfs, tmpfs, cgroup, lxcfs, …) —
their capacity numbers are kernel bookkeeping, not user data.
- The CT rootfs (``/``) — already covered by ``_check_lxc_disk_usage``.
- Mounts that fail statvfs (stale handle, perms): silently
skipped so a hung NFS doesn't blow up the entire scan.
Returns ``[{vmid, name, mount, fstype, total_bytes, used_bytes,
    available_bytes, usage_percent}, …]``. This scan keeps no cache of
    its own: the only subprocess cost is the per-CT ``lxc-info`` lookup
    in `_list_running_lxcs`, and the per-mount numbers come from
    ``statvfs`` (a syscall, not a subprocess), so each call rescans.
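
    Example entry (illustrative values)::

        {'vmid': '101', 'name': 'media-server', 'mount': '/mnt/media',
         'source': '192.168.1.10:/export/media', 'fstype': 'nfs4',
         'readonly': False, 'total_bytes': 499596492800,
         'used_bytes': 23816245248, 'available_bytes': 475780247552,
         'usage_percent': 4.8}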
"""
if not force and not _has_any_running_lxc():
return []
out: list[dict[str, Any]] = []
for ct in _list_running_lxcs():
host_pid = ct.get('pid')
vmid = ct.get('vmid')
name = ct.get('name', '')
if not host_pid or not vmid:
continue
try:
with open(f'/proc/{host_pid}/mounts', 'r') as f:
lines = f.read().splitlines()
except (OSError, IOError):
continue
for line in lines:
parts = line.split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
# Skip pseudo-filesystems and the CT rootfs.
if fstype in _PSEUDO_FS or fstype.startswith('fuse.'):
continue
if target == '/':
continue
# statvfs through the CT's mount namespace.
host_path = f'/proc/{host_pid}/root{target}'
try:
st = os.statvfs(host_path)
except (OSError, FileNotFoundError):
continue
if st.f_blocks == 0:
continue # zero-size mount (sometimes an empty cgroup)
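            # statvfs math: f_frsize is the fragment size in bytes, f_bavail is
            # what an unprivileged user can still allocate, and "used" is derived
            # from f_bfree, so used + available can fall short of total when the
            # filesystem reserves blocks for root (typically zero for network FS).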
total = st.f_blocks * st.f_frsize
available = st.f_bavail * st.f_frsize
used = total - (st.f_bfree * st.f_frsize)
pct = (used / total) * 100 if total > 0 else 0.0
out.append({
'vmid': vmid,
'name': name,
'mount': target,
'source': source,
'fstype': fstype,
'readonly': 'ro' in set(options.split(',')),
'total_bytes': total,
'used_bytes': used,
'available_bytes': available,
'usage_percent': round(pct, 1),
})
return out
def _check_reachable_from_host(host_pid: str, ct_target: str,
timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Stat a CT-internal path through ``/proc/<pid>/root``.
The Linux kernel exposes every running process's mount namespace
under ``/proc/<pid>/root``, so the host can reach the CT's view of
a path without spawning a second ``pct exec``. Same timeout
semantics as the host-side ``_check_reachable``.
"""
if not host_pid:
return {'reachable': False, 'error': 'CT pid unknown'}
full_path = f'/proc/{host_pid}/root{ct_target}'
try:
result = subprocess.run(
['stat', '-c', '%i', full_path],
capture_output=True, text=True, timeout=timeout,
)
if result.returncode == 0:
return {'reachable': True, 'error': None}
err = (result.stderr or result.stdout).strip() or 'stat returned non-zero'
return {'reachable': False, 'error': err}
except subprocess.TimeoutExpired:
return {
'reachable': False,
'error': f'stat timed out after {timeout}s (likely stale handle inside CT)',
}
except OSError as e:
return {'reachable': False, 'error': str(e)}
def scan_lxc_mounts(force: bool = False) -> list[dict[str, Any]]:
"""Top-level scan of remote mounts inside every running LXC.
Cached for the same TTL as ``scan_remote_mounts``. Each entry
follows the same shape as host mounts plus three CT-specific
fields: ``lxc_id``, ``lxc_name``, ``lxc_pid``. ``proxmox_managed``
is always ``False`` for LXC mounts (PVE doesn't manage mounts done
inside containers).
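
    Illustrative entry (values are examples, not real CTs)::

        {'source': '192.168.1.20:/export/backups', 'target': '/mnt/backups',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False,
         'lxc_id': '101', 'lxc_name': 'media-server', 'lxc_pid': '4242',
         'proxmox_managed': False, 'reachable': False,
         'error': 'stat timed out after 2s (likely stale handle inside CT)',
         'total_bytes': None, 'used_bytes': None, 'available_bytes': None,
         'status': 'stale'}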
"""
now = time.time()
if not force:
with _lxc_cache_lock:
if now - _lxc_cache.get('scanned_at', 0) < _CACHE_TTL_SEC:
return list(_lxc_cache.get('mounts', []))
    # Cheap pre-check: skip per-CT enumeration entirely when no CTs are
    # running. Before the /proc-walk rewrite, `pct list` alone took ~700ms
    # on a typical Proxmox host (perl startup + cluster file lock), so on
    # nodes that only run VMs (or nothing at all) that call accounted for
    # ~0.23% of baseline CPU every 5 minutes for a result that was always
    # empty.
    #
    # Detection: walk /proc looking for any `lxc-start` process (the
    # supervisor that spawns each running CT's init). `/run/lxc/` always
    # contains `lock/` and `var/` admin dirs even with zero CTs, so it
    # can't be used as a count signal. The /proc walk costs ~1-5ms and
    # bails on the first match.
if not _has_any_running_lxc():
with _lxc_cache_lock:
_lxc_cache['scanned_at'] = now
_lxc_cache['mounts'] = []
return []
enriched: list[dict[str, Any]] = []
for ct in _list_running_lxcs():
ct_mounts = _read_lxc_mounts(ct)
for m in ct_mounts:
health = _check_reachable_from_host(ct['pid'], m['target'])
entry = dict(m)
entry['lxc_id'] = ct['vmid']
entry['lxc_name'] = ct['name']
entry['lxc_pid'] = ct['pid']
entry['proxmox_managed'] = False
entry['reachable'] = health['reachable']
entry['error'] = health['error']
            # Disk usage is intentionally skipped here: querying capacity on a
            # stale mount would block for another timeout, and the dashboard's
            # "Capacity" section would be misleading for a stale mount anyway.
            # Healthy CT mounts get their capacity numbers from
            # scan_lxc_mount_capacity instead.
entry['total_bytes'] = None
entry['used_bytes'] = None
entry['available_bytes'] = None
if not health['reachable']:
entry['status'] = 'stale'
elif m['readonly']:
entry['status'] = 'readonly'
else:
entry['status'] = 'ok'
enriched.append(entry)
with _lxc_cache_lock:
_lxc_cache['scanned_at'] = now
_lxc_cache['mounts'] = enriched
return enriched