# ProxMenux/AppImage/scripts/mount_monitor.py
"""Sprint 13: detect remote mount issues that PVE storage monitoring misses.
Parses ``/proc/mounts`` filtering NFS/CIFS/SMB entries, then for each
one runs a timeout-bounded ``stat`` to catch stale handles. Stale NFS
is the typical failure mode that broke a user's LXC: the mount looks
present in ``/proc/mounts`` but any access either blocks indefinitely
or returns ``ESTALE``. Meanwhile any app in the LXC that keeps writing
to that path appends to the underlying directory on the local
filesystem (because the mount is effectively gone), which silently
fills up the LXC's root disk and eventually kills the container.
This module sits next to ``proxmox_storage_monitor.py`` (which only
covers PVE-registered storages) and complements it for arbitrary
remote mounts done outside PVE (e.g. ``/etc/fstab`` entries, ad-hoc
``mount -t cifs``, etc.).
Scope:
- Host mounts (Sprint 13). Mounts made inside running LXCs were
  initially out of scope, since reaching them via ``pct exec`` is slow
  and can hang on a corrupted guest; Sprint 13.24 (the LXC section
  below) added them by reading each CT's mount table straight from
  ``/proc/<pid>/mounts`` on the host.
- Detects stale mounts (timeout/ESTALE), unexpected read-only mounts,
  and plain reachable (``ok``) ones.
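
Typical read-side usage, assuming the module is importable as
``mount_monitor`` (illustrative sketch)::

    from mount_monitor import get_unhealthy_mounts

    for mount in get_unhealthy_mounts():
        print(mount['target'], mount['status'], mount['error'])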
"""
from __future__ import annotations
import os
import re
import subprocess
import threading
import time
from typing import Any
# `nfs`, `nfs4`, `cifs`, `smbfs`, `smb3`, etc. — any FS type whose name
# starts with one of the three remote families. Keeps the filter
# permissive without listing every variant.
_REMOTE_FS_RE = re.compile(r'^(nfs|cifs|smb)', re.IGNORECASE)
# Per-mount stat timeout. Configurable via env var so an admin running
# on a slow link can bump it without waiting for a code change. Default
# is 2 seconds — long enough that a healthy NFS over LAN responds, short
# enough that a stale mount doesn't block the health-check pipeline.
_STAT_TIMEOUT_SEC = int(os.environ.get('PROXMENUX_MOUNT_STAT_TIMEOUT', '2'))
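# (For example, launching the monitor with PROXMENUX_MOUNT_STAT_TIMEOUT=5
# in its environment raises the per-mount budget to 5 seconds.)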
# Top-level cache TTL: 60 s. Each scan is cheap (one stat, plus one df per
# reachable mount) but we don't want to redo it on every API hit either,
# especially when the dashboard polls every 5 s.
_CACHE_TTL_SEC = 60
_cache_lock = threading.Lock()
_cache: dict[str, Any] = {
'scanned_at': 0.0,
'mounts': [],
}
def _read_proc_mounts() -> list[dict[str, Any]]:
"""Parse /proc/mounts and return only NFS/CIFS/SMB entries.
Each entry: source, target, fstype, options (raw string), readonly.
Anything that fails to parse is skipped silently — this is a
monitor, not a validator, and a malformed line shouldn't crash the
health pipeline.
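
    A typical NFS line and the dict it becomes (illustrative values)::

        192.168.1.10:/export/media /mnt/media nfs4 rw,relatime,vers=4.2 0 0

        {'source': '192.168.1.10:/export/media', 'target': '/mnt/media',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False}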
"""
out: list[dict[str, Any]] = []
try:
with open('/proc/mounts', 'r', encoding='utf-8', errors='replace') as f:
for line in f:
parts = line.strip().split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
if not _REMOTE_FS_RE.match(fstype):
continue
opts_set = set(options.split(','))
out.append({
'source': source,
'target': target,
'fstype': fstype,
'options': options,
'readonly': 'ro' in opts_set,
})
except OSError:
pass
return out
def _check_reachable(target: str, timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Run ``stat`` against the mount target with a hard timeout.
Returns ``{reachable: bool, error: str | None}``. We use the
external ``stat`` binary rather than ``os.stat`` because the C
syscall blocks the GIL when an NFS mount is stale, and a hung
syscall would freeze the entire health monitor thread —
subprocess gives us a real timeout we can enforce.
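
    Illustrative results (example paths, not real hosts)::

        _check_reachable('/mnt/media')
        -> {'reachable': True, 'error': None}
        _check_reachable('/mnt/stale-share')
        -> {'reachable': False,
            'error': 'stat timed out after 2s (likely stale NFS handle)'}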
"""
try:
result = subprocess.run(
['stat', '-c', '%i', target],
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode == 0:
return {'reachable': True, 'error': None}
err = (result.stderr or result.stdout).strip() or 'stat returned non-zero'
return {'reachable': False, 'error': err}
except subprocess.TimeoutExpired:
return {
'reachable': False,
'error': f'stat timed out after {timeout}s (likely stale NFS handle)',
}
except OSError as e:
return {'reachable': False, 'error': str(e)}
def _disk_usage(target: str, timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Run ``df`` against the mount target with a hard timeout.
    Like ``_check_reachable``, we shell out so a stale NFS doesn't
    freeze the calling thread. Returns ``{total_bytes, used_bytes,
    available_bytes}`` when the call succeeds, with every field set to
    ``None`` when it times out or fails; the modal renders "n/a" then.
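
    Typical output being parsed (illustrative numbers)::

          1B-blocks        Used        Avail
        499596492800 23816245248 475780247552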
"""
empty = {'total_bytes': None, 'used_bytes': None, 'available_bytes': None}
try:
result = subprocess.run(
['df', '-B1', '--output=size,used,avail', target],
capture_output=True,
text=True,
timeout=timeout,
)
if result.returncode != 0:
return empty
# Output: header + 1 data line. Splitting on whitespace gives 3
# ints when df succeeds.
lines = [ln for ln in result.stdout.strip().splitlines() if ln.strip()]
if len(lines) < 2:
return empty
parts = lines[-1].split()
if len(parts) < 3:
return empty
try:
return {
'total_bytes': int(parts[0]),
'used_bytes': int(parts[1]),
'available_bytes': int(parts[2]),
}
except ValueError:
return empty
except (subprocess.TimeoutExpired, OSError):
return empty
def _is_proxmox_managed(target: str) -> bool:
"""True when the mount target lives under ``/mnt/pve/``.
    PVE's storage layer auto-mounts every NFS/CIFS storage at
    ``/mnt/pve/<storage_id>``, and by convention nothing else uses that
    prefix. So a target starting with it is reliably a PVE-managed mount,
    and the dashboard can flag it as such without paying a ``pvesh``
    round-trip per mount.
"""
return target.startswith('/mnt/pve/')
def scan_remote_mounts(force: bool = False) -> list[dict[str, Any]]:
"""Top-level scan: list each remote mount with its health status.
Cached for ``_CACHE_TTL_SEC`` so back-to-back API hits don't all
pay the stat cost. Pass ``force=True`` to bypass the cache (used
by the health monitor to make sure each poll round sees fresh
state).
Each entry adds:
- ``reachable``: bool
- ``error``: str | None
- ``status``: 'ok' | 'stale' | 'readonly'
``stale`` wins over ``readonly`` when both apply — a stale
mount is a higher-severity issue.
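
    Example entry (illustrative values)::

        {'source': '192.168.1.10:/export/media', 'target': '/mnt/pve/media',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False,
         'proxmox_managed': True, 'reachable': True, 'error': None,
         'total_bytes': 499596492800, 'used_bytes': 23816245248,
         'available_bytes': 475780247552, 'status': 'ok'}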
"""
now = time.time()
if not force:
with _cache_lock:
if now - _cache.get('scanned_at', 0) < _CACHE_TTL_SEC:
return list(_cache.get('mounts', []))
raw = _read_proc_mounts()
enriched: list[dict[str, Any]] = []
for m in raw:
health = _check_reachable(m['target'])
entry = dict(m)
entry['reachable'] = health['reachable']
entry['error'] = health['error']
entry['proxmox_managed'] = _is_proxmox_managed(m['target'])
# df only when the mount is reachable — running df on a stale
# mount blocks until the same timeout as stat, doubling the
# delay for nothing useful.
if health['reachable']:
entry.update(_disk_usage(m['target']))
else:
entry.update({'total_bytes': None, 'used_bytes': None, 'available_bytes': None})
if not health['reachable']:
entry['status'] = 'stale'
elif m['readonly']:
entry['status'] = 'readonly'
else:
entry['status'] = 'ok'
enriched.append(entry)
with _cache_lock:
_cache['scanned_at'] = now
_cache['mounts'] = enriched
return enriched
def get_unhealthy_mounts() -> list[dict[str, Any]]:
"""Convenience: only return mounts whose status is not ``ok``."""
return [m for m in scan_remote_mounts() if m.get('status') != 'ok']
# ---------------------------------------------------------------------------
# LXC mount scanning (Sprint 13.24)
# ---------------------------------------------------------------------------
#
# The case the user reported was an NFS mount **inside** an LXC going stale:
# the host doesn't see the mount in its own /proc/mounts, so the host scan
# above misses it entirely. The container, meanwhile, keeps writing to the
# stale path which silently fills its rootfs.
#
# We enumerate running LXCs by walking /proc for `lxc-start` processes,
# resolve each CT's init pid with `lxc-info -p` (bounded by a short
# timeout so a wedged CT can't hang the health monitor thread), and then
# read each CT's mount table directly from `/proc/<pid>/mounts`; no
# `pct exec` is needed.
#
# Stale detection also runs from the host, via `/proc/<pid>/root/<target>`
# rather than a `pct exec stat` per mount, which avoids spawning an extra
# exec per mount and is faster.
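#
# For illustration: a mount at /mnt/backups inside a CT whose init pid on
# the host is 4242 shows up to the host as /proc/4242/root/mnt/backups.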
# Per-CT timeout for container-level subprocess calls. 3 s comfortably
# covers a healthy CT; the env var lets an admin bump it on slow hosts.
_LXC_EXEC_TIMEOUT_SEC = int(os.environ.get('PROXMENUX_LXC_EXEC_TIMEOUT', '3'))
_lxc_cache_lock = threading.Lock()
_lxc_cache: dict[str, Any] = {
'scanned_at': 0.0,
'mounts': [],
}
def _has_any_running_lxc() -> bool:
"""Cheap "is at least one CT running?" probe.
Walks ``/proc`` looking for any process whose ``comm`` is
``lxc-start`` (the init shim that spawns CT pid 1). Bails on the
first match. Costs ~1-5ms even on hosts with thousands of
    processes. Used as a short-circuit before the more expensive per-CT
    ``lxc-info`` enumeration in `scan_lxc_mounts`.
"""
try:
for entry in os.scandir('/proc'):
if not entry.name.isdigit():
continue
try:
with open(f'/proc/{entry.name}/comm', 'r') as f:
if f.read().strip() == 'lxc-start':
return True
except (OSError, IOError):
continue
except OSError:
# If /proc is unreadable something is very wrong; let the
# caller proceed with the full scan rather than silently
# claiming no CTs run.
return True
return False
def _read_lxc_name(vmid: str) -> str:
"""Look up the CT hostname from /etc/pve/lxc/<vmid>.conf without
invoking ``pct``. Returns '' if the file is unreadable."""
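    # Illustrative config lines matched below:
    #   hostname: media-server          (PVE /etc/pve/lxc/<vmid>.conf)
    #   lxc.uts.name = media-server     (raw LXC config)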
for path in (f'/etc/pve/lxc/{vmid}.conf', f'/var/lib/lxc/{vmid}/config'):
try:
with open(path, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('hostname:'):
return line.split(':', 1)[1].strip()
if line.startswith('lxc.uts.name'):
# `lxc.uts.name = foo`
return line.split('=', 1)[1].strip()
except (OSError, IOError):
continue
return ''
def _list_running_lxcs() -> list[dict[str, str]]:
"""Return ``[{vmid, name, pid}]`` for every running LXC.
We need ``pid`` (the init process inside the CT, visible to the
host) so we can stat the mount target via ``/proc/<pid>/root/...``
without entering the container with another ``pct exec``.
Implementation walks ``/proc`` for ``lxc-start -F -n <vmid>``
processes — the userspace shim that supervises each running CT —
and resolves the CT init pid via ``lxc-info -p`` (~2 ms) instead
of the previous ``pct status --verbose`` chain (~500 ms per CT).
On a 7-CT host this collapses ~7 seconds of subprocess churn into
a single /proc walk plus seven 2 ms calls, dropping the full
``scan_lxc_mounts`` cost from ~8 s to <100 ms.
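
    Illustrative result (vmids, names and pids are examples)::

        [{'vmid': '101', 'name': 'media-server', 'pid': '4242'},
         {'vmid': '102', 'name': 'backup', 'pid': '4310'}]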
"""
out: list[dict[str, str]] = []
try:
proc_entries = list(os.scandir('/proc'))
except OSError:
return out
for entry in proc_entries:
if not entry.name.isdigit():
continue
try:
with open(f'/proc/{entry.name}/comm', 'r') as f:
if f.read().strip() != 'lxc-start':
continue
with open(f'/proc/{entry.name}/cmdline', 'rb') as f:
cmdline = f.read().split(b'\x00')
except (OSError, IOError):
continue
# cmdline like [b'/usr/bin/lxc-start', b'-F', b'-n', b'<vmid>', b'']
vmid = ''
try:
idx = cmdline.index(b'-n')
if idx + 1 < len(cmdline):
vmid = cmdline[idx + 1].decode('utf-8', errors='replace').strip()
except ValueError:
continue
if not vmid:
continue
pid = ''
try:
p2 = subprocess.run(
['lxc-info', '-n', vmid, '-p'],
capture_output=True, text=True, timeout=2,
)
if p2.returncode == 0:
for ln in p2.stdout.splitlines():
# lxc-info output: "PID: 12345"
if ln.strip().lower().startswith('pid:'):
pid = ln.split(':', 1)[1].strip()
break
except (subprocess.TimeoutExpired, OSError):
pass
out.append({'vmid': vmid, 'name': _read_lxc_name(vmid), 'pid': pid})
# Stable ordering by vmid for deterministic output.
out.sort(key=lambda c: int(c['vmid']) if c['vmid'].isdigit() else 0)
return out
def _read_lxc_mounts(ct: dict[str, str]) -> list[dict[str, Any]]:
"""Read remote FS mounts inside a running CT.
Uses ``/proc/<host_pid>/mounts`` (the kernel exposes every running
process's mount namespace there), so the host can read the CT's
full mount table directly with no ``pct exec`` subprocess. Returns
``[]`` on any failure rather than raising — a single bad CT
shouldn't break the scan of the rest.
Accepts a ``ct`` dict (from `_list_running_lxcs`) instead of a
bare vmid because we need the host PID, which is only available
after the lxc-info lookup.
"""
out: list[dict[str, Any]] = []
pid = ct.get('pid')
if not pid:
return out
try:
with open(f'/proc/{pid}/mounts', 'r') as f:
mount_lines = f.read().splitlines()
except (OSError, IOError):
return out
for line in mount_lines:
parts = line.split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
if not _REMOTE_FS_RE.match(fstype):
continue
out.append({
'source': source,
'target': target,
'fstype': fstype,
'options': options,
'readonly': 'ro' in set(options.split(',')),
})
return out
# Pseudo / virtual filesystems we never want to surface as a "mount
# nearing capacity" — these are kernel-managed and the numbers from
# statvfs are either nonsense (cgroup, sysfs) or change too fast to
# alert on (tmpfs).
_PSEUDO_FS = frozenset({
'proc', 'sysfs', 'devpts', 'devtmpfs', 'tmpfs', 'mqueue', 'pstore',
'cgroup', 'cgroup2', 'bpf', 'tracefs', 'debugfs', 'configfs',
'securityfs', 'fuse.lxcfs', 'fusectl', 'autofs', 'binfmt_misc',
'hugetlbfs', 'efivarfs', 'rpc_pipefs', 'nsfs', 'overlay',
})
def scan_lxc_mount_capacity(force: bool = False) -> list[dict[str, Any]]:
"""Capacity scan of mountpoints inside every running LXC.
Sibling of `scan_lxc_mounts` — same /proc-walk and lxc-info pattern
— but enumerates ALL real filesystems (not just NFS/CIFS/SMB) and
returns capacity numbers via ``os.statvfs`` on the host-side
namespace path ``/proc/<host_pid>/root/<target>``. Used by the
Phase 3 ``_check_lxc_mount_capacity`` health check.
Skips:
- Pseudo-filesystems (proc, sysfs, tmpfs, cgroup, lxcfs, …) —
their capacity numbers are kernel bookkeeping, not user data.
- The CT rootfs (``/``) — already covered by ``_check_lxc_disk_usage``.
- Mounts that fail statvfs (stale handle, perms): silently
skipped so a hung NFS doesn't blow up the entire scan.
Returns ``[{vmid, name, mount, fstype, total_bytes, used_bytes,
    available_bytes, usage_percent}, …]``. This scan keeps no cache of
    its own: the only subprocess cost is the per-CT ``lxc-info`` lookup
    in `_list_running_lxcs`, and the per-mount numbers come from
    ``statvfs`` (a syscall, not a subprocess), so each call rescans.
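
    Example entry (illustrative values)::

        {'vmid': '101', 'name': 'media-server', 'mount': '/mnt/media',
         'source': '192.168.1.10:/export/media', 'fstype': 'nfs4',
         'readonly': False, 'total_bytes': 499596492800,
         'used_bytes': 23816245248, 'available_bytes': 475780247552,
         'usage_percent': 4.8}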
"""
if not force and not _has_any_running_lxc():
return []
out: list[dict[str, Any]] = []
for ct in _list_running_lxcs():
host_pid = ct.get('pid')
vmid = ct.get('vmid')
name = ct.get('name', '')
if not host_pid or not vmid:
continue
try:
with open(f'/proc/{host_pid}/mounts', 'r') as f:
lines = f.read().splitlines()
except (OSError, IOError):
continue
for line in lines:
parts = line.split()
if len(parts) < 4:
continue
source, target, fstype, options = parts[0], parts[1], parts[2], parts[3]
# Skip pseudo-filesystems and the CT rootfs.
if fstype in _PSEUDO_FS or fstype.startswith('fuse.'):
continue
if target == '/':
continue
# statvfs through the CT's mount namespace.
host_path = f'/proc/{host_pid}/root{target}'
try:
st = os.statvfs(host_path)
except (OSError, FileNotFoundError):
continue
if st.f_blocks == 0:
continue # zero-size mount (sometimes an empty cgroup)
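            # statvfs math: f_frsize is the fragment size in bytes, f_bavail is
            # what an unprivileged user can still allocate, and "used" is derived
            # from f_bfree, so used + available can fall short of total when the
            # filesystem reserves blocks for root (typically zero for network FS).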
total = st.f_blocks * st.f_frsize
available = st.f_bavail * st.f_frsize
used = total - (st.f_bfree * st.f_frsize)
pct = (used / total) * 100 if total > 0 else 0.0
out.append({
'vmid': vmid,
'name': name,
'mount': target,
'source': source,
'fstype': fstype,
'readonly': 'ro' in set(options.split(',')),
'total_bytes': total,
'used_bytes': used,
'available_bytes': available,
'usage_percent': round(pct, 1),
})
return out
def _check_reachable_from_host(host_pid: str, ct_target: str,
timeout: int = _STAT_TIMEOUT_SEC) -> dict[str, Any]:
"""Stat a CT-internal path through ``/proc/<pid>/root``.
The Linux kernel exposes every running process's mount namespace
under ``/proc/<pid>/root``, so the host can reach the CT's view of
a path without spawning a second ``pct exec``. Same timeout
semantics as the host-side ``_check_reachable``.
"""
if not host_pid:
return {'reachable': False, 'error': 'CT pid unknown'}
full_path = f'/proc/{host_pid}/root{ct_target}'
try:
result = subprocess.run(
['stat', '-c', '%i', full_path],
capture_output=True, text=True, timeout=timeout,
)
if result.returncode == 0:
return {'reachable': True, 'error': None}
err = (result.stderr or result.stdout).strip() or 'stat returned non-zero'
return {'reachable': False, 'error': err}
except subprocess.TimeoutExpired:
return {
'reachable': False,
'error': f'stat timed out after {timeout}s (likely stale handle inside CT)',
}
except OSError as e:
return {'reachable': False, 'error': str(e)}
def scan_lxc_mounts(force: bool = False) -> list[dict[str, Any]]:
"""Top-level scan of remote mounts inside every running LXC.
Cached for the same TTL as ``scan_remote_mounts``. Each entry
follows the same shape as host mounts plus three CT-specific
fields: ``lxc_id``, ``lxc_name``, ``lxc_pid``. ``proxmox_managed``
is always ``False`` for LXC mounts (PVE doesn't manage mounts done
inside containers).
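
    Illustrative entry (values are examples, not real CTs)::

        {'source': '192.168.1.20:/export/backups', 'target': '/mnt/backups',
         'fstype': 'nfs4', 'options': 'rw,relatime,vers=4.2', 'readonly': False,
         'lxc_id': '101', 'lxc_name': 'media-server', 'lxc_pid': '4242',
         'proxmox_managed': False, 'reachable': False,
         'error': 'stat timed out after 2s (likely stale handle inside CT)',
         'total_bytes': None, 'used_bytes': None, 'available_bytes': None,
         'status': 'stale'}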
"""
now = time.time()
if not force:
with _lxc_cache_lock:
if now - _lxc_cache.get('scanned_at', 0) < _CACHE_TTL_SEC:
return list(_lxc_cache.get('mounts', []))
    # Cheap pre-check: skip per-CT enumeration entirely when no CTs are
    # running. Before the /proc-walk rewrite, `pct list` alone took ~700ms
    # on a typical Proxmox host (perl startup + cluster file lock), so on
    # nodes that only run VMs (or nothing at all) that call accounted for
    # ~0.23% of baseline CPU every 5 minutes for a result that was always
    # empty.
    #
    # Detection: walk /proc looking for any `lxc-start` process (the
    # supervisor that spawns each running CT's init). `/run/lxc/` always
    # contains `lock/` and `var/` admin dirs even with zero CTs, so it
    # can't be used as a count signal. The /proc walk costs ~1-5ms and
    # bails on the first match.
if not _has_any_running_lxc():
with _lxc_cache_lock:
_lxc_cache['scanned_at'] = now
_lxc_cache['mounts'] = []
return []
enriched: list[dict[str, Any]] = []
for ct in _list_running_lxcs():
ct_mounts = _read_lxc_mounts(ct)
for m in ct_mounts:
health = _check_reachable_from_host(ct['pid'], m['target'])
entry = dict(m)
entry['lxc_id'] = ct['vmid']
entry['lxc_name'] = ct['name']
entry['lxc_pid'] = ct['pid']
entry['proxmox_managed'] = False
entry['reachable'] = health['reachable']
entry['error'] = health['error']
            # Disk usage is intentionally skipped here: querying capacity on a
            # stale mount would block for another timeout, and the dashboard's
            # "Capacity" section would be misleading for a stale mount anyway.
            # Healthy CT mounts get their capacity numbers from
            # scan_lxc_mount_capacity instead.
entry['total_bytes'] = None
entry['used_bytes'] = None
entry['available_bytes'] = None
if not health['reachable']:
entry['status'] = 'stale'
elif m['readonly']:
entry['status'] = 'readonly'
else:
entry['status'] = 'ok'
enriched.append(entry)
with _lxc_cache_lock:
_lxc_cache['scanned_at'] = now
_lxc_cache['mounts'] = enriched
return enriched