diff --git a/AppImage/ProxMenux-1.2.1.1-beta.AppImage b/AppImage/ProxMenux-1.2.1.1-beta.AppImage index c78f5655..c8240187 100755 Binary files a/AppImage/ProxMenux-1.2.1.1-beta.AppImage and b/AppImage/ProxMenux-1.2.1.1-beta.AppImage differ diff --git a/AppImage/ProxMenux-Monitor.AppImage.sha256 b/AppImage/ProxMenux-Monitor.AppImage.sha256 index be31185b..c10b8619 100644 --- a/AppImage/ProxMenux-Monitor.AppImage.sha256 +++ b/AppImage/ProxMenux-Monitor.AppImage.sha256 @@ -1 +1 @@ -6249ae8d51e0d7dbd3035ba49f4244ff035c2c6d97d5c55f69ab0dac6a4ea021 ProxMenux-1.2.1.1-beta.AppImage +70a510025df81652319d16e0d36e77bea95a965163608232e9aca60ada9c9fbf ProxMenux-1.2.1.1-beta.AppImage diff --git a/AppImage/components/virtual-machines.tsx b/AppImage/components/virtual-machines.tsx index 8ddf1536..2df045c9 100644 --- a/AppImage/components/virtual-machines.tsx +++ b/AppImage/components/virtual-machines.tsx @@ -170,6 +170,12 @@ interface LxcMountPoint { runtime_readonly?: boolean runtime_reachable?: boolean runtime_error?: string | null + // Sprint 14.x: host-side bind source state. Detects the case where the + // CT still reports a bind as mounted even though the host already + // umounted the source (Ignacio Seijo 11/05). Null = N/A (PVE volume, + // not a host path). + host_source_exists?: boolean | null + host_source_is_mountpoint?: boolean | null } const fetcher = async (url: string) => { @@ -321,9 +327,18 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) { const isStale = mp.runtime_reachable === false const isReadonly = !isStale && mp.runtime_readonly === true const isDivergent = mp.runtime_mounted === false // configured but not actually mounted + // "Zombie bind": the host removed the source (e.g. USB pulled, manual + // umount) but the CT mount namespace still shows the bind as mounted. + // Reported by Ignacio Seijo (11/05). Only flag host_bind / + // pve_storage_bind sources — PVE volume sources have no host path + // and `host_source_exists` comes back null for them. + const isHostDetached = + mp.runtime_mounted === true && + (mp.type === "host_bind" || mp.type === "pve_storage_bind") && + mp.host_source_exists === false const cardClasses = isStale ? "border-red-500/50 bg-red-500/5" - : isDivergent + : isDivergent || isHostDetached ? "border-amber-500/40 bg-amber-500/5" : isReadonly ? "border-amber-500/30 bg-amber-500/5" @@ -395,7 +410,7 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) { className={ isStale ? "bg-red-500/10 text-red-500 border-red-500/20" - : isDivergent + : isDivergent || isHostDetached ? "bg-amber-500/10 text-amber-500 border-amber-500/20" : isReadonly ? "bg-amber-500/10 text-amber-500 border-amber-500/20" @@ -408,11 +423,13 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) { ? "stale" : isDivergent ? "not mounted" - : isReadonly - ? "read-only" - : mp.runtime_mounted === null - ? "stopped" - : "mounted"} + : isHostDetached + ? "host detached" + : isReadonly + ? "read-only" + : mp.runtime_mounted === null + ? "stopped" + : "mounted"} diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py index 58072281..e80379e6 100644 --- a/AppImage/scripts/flask_notification_routes.py +++ b/AppImage/scripts/flask_notification_routes.py @@ -191,6 +191,24 @@ def _bad_request(msg: str): return jsonify({'error': msg}), 400 +def _is_loopback_addr(value: str) -> bool: + """Return True for IPv4, IPv6 and IPv4-mapped loopback addresses. + + When Flask is bound to ``::`` for dual-stack support, an HTTP request + sent to ``127.0.0.1`` can be reported as ``::ffff:127.0.0.1``. Treat it + as local so the PVE webhook keeps the intended localhost trust path. + """ + try: + import ipaddress + addr = ipaddress.ip_address(value) + if addr.is_loopback: + return True + ipv4_mapped = getattr(addr, 'ipv4_mapped', None) + return bool(ipv4_mapped and ipv4_mapped.is_loopback) + except ValueError: + return value == 'localhost' + + def _validate_event_type(value: str) -> bool: return isinstance(value, str) and bool(_EVENT_TYPE_RE.match(value)) @@ -983,9 +1001,15 @@ def setup_pve_webhook_core() -> dict: # endpoint depends entirely on the localhost-bypass and any move # to a non-loopback bind silently breaks auth. Audit Tier 3.1 — # `setup_pve_webhook_core` no escribe secret en priv cfg. + # + # PVE stores `secret value=` in STANDARD base64 and decodes it + # before emitting the header. Writing the raw token here triggered + # `could not decode UTF8 string from base64, key 'X-Webhook-Secret' (500)` + # whenever `token_urlsafe` produced `-` or `_` chars (GH #198). + secret_b64 = base64.b64encode(secret.encode()).decode() priv_block = ( f"webhook: {_PVE_ENDPOINT_ID}\n" - f" secret name=X-Webhook-Secret,value={secret}\n" + f" secret name=X-Webhook-Secret,value={secret_b64}\n" ) if priv_text is not None: @@ -1225,7 +1249,7 @@ def proxmox_webhook(): _reject = lambda code, error, status: (jsonify({'accepted': False, 'error': error}), status) client_ip = request.remote_addr or '' - is_localhost = client_ip in ('127.0.0.1', '::1') + is_localhost = _is_loopback_addr(client_ip) # CSRF defence-in-depth: reject `application/x-www-form-urlencoded` # bodies. PVE always sends `application/json`; form-encoded bodies diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index abc96703..5e1413f2 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -4197,22 +4197,37 @@ class HealthMonitor: """ cache_key = 'updates_check' current_time = time.time() - - # Cache for 10 minutes - if cache_key in self.last_check_times: - if current_time - self.last_check_times[cache_key] < 600: - return self.cached_results.get(cache_key) - + apt_history_path = '/var/log/apt/history.log' + + # Detect a manual `apt install/upgrade` since the last check by + # comparing /var/log/apt/history.log's mtime against the cache + # timestamp. apt appends to this file on every transaction, so a + # newer mtime means the local package state changed and the cached + # pending-updates list is stale. Reported by Alberto (14/5): the + # dashboard tile kept showing pending updates ~hours after he ran + # `apt upgrade` manually. Cheap stat call; runs at most once per + # /api/health/full request. + history_mtime = None + try: + if os.path.exists(apt_history_path): + history_mtime = os.path.getmtime(apt_history_path) + except Exception: + history_mtime = None + + if cache_key in self.last_check_times: + cache_ts = self.last_check_times[cache_key] + history_changed = (history_mtime is not None and history_mtime > cache_ts) + if not history_changed and current_time - cache_ts < 600: + return self.cached_results.get(cache_key) + try: - apt_history_path = '/var/log/apt/history.log' last_update_days = None sec_result = None age_result = None - - if os.path.exists(apt_history_path): + + if history_mtime is not None: try: - mtime = os.path.getmtime(apt_history_path) - days_since_update = (current_time - mtime) / 86400 + days_since_update = (current_time - history_mtime) / 86400 last_update_days = int(days_since_update) except Exception: pass @@ -5775,12 +5790,24 @@ class HealthMonitor: 'used_bytes': used, } error_key = f'pve_storage_full_{name}' + # If the user already dismissed this exact error (within the + # suppression window), don't count it toward the category + # severity badge. Without this guard the storage section stayed + # WARNING/CRITICAL forever even after dismiss because the + # underlying % is unchanged — `record_error` correctly returned + # `skipped_acknowledged` to silence the notification side, but + # the dashboard counter ignored that signal and the user saw + # "Storage: 1 Warning" with no way to clear it. Reported on + # the community channel re: PBS-lleno (17-18/05). + is_dismissed = health_persistence.is_error_acknowledged(error_key) if pct >= crit_pct: - entry['status'] = 'CRITICAL' + entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO' entry['error_key'] = error_key entry['dismissable'] = True + entry['dismissed'] = is_dismissed checks[label] = entry - critical_labels.append(label) + if not is_dismissed: + critical_labels.append(label) emitted_keys.add(error_key) health_persistence.record_error( error_key=error_key, @@ -5790,11 +5817,13 @@ class HealthMonitor: details=entry, ) elif pct >= warn_pct: - entry['status'] = 'WARNING' + entry['status'] = 'WARNING' if not is_dismissed else 'INFO' entry['error_key'] = error_key entry['dismissable'] = True + entry['dismissed'] = is_dismissed checks[label] = entry - warning_labels.append(label) + if not is_dismissed: + warning_labels.append(label) emitted_keys.add(error_key) health_persistence.record_error( error_key=error_key, @@ -5877,12 +5906,18 @@ class HealthMonitor: 'pool_name': name, } error_key = f'zfs_pool_full_{name}' + # Same dismiss-respect as `_check_pve_storage_capacity`. A pool + # that the user dismissed keeps its underlying % but should no + # longer flip the category badge to WARNING/CRITICAL. + is_dismissed = health_persistence.is_error_acknowledged(error_key) if pct >= crit_pct: - entry['status'] = 'CRITICAL' + entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO' entry['error_key'] = error_key entry['dismissable'] = True + entry['dismissed'] = is_dismissed checks[name] = entry - critical_labels.append(name) + if not is_dismissed: + critical_labels.append(name) emitted_keys.add(error_key) health_persistence.record_error( error_key=error_key, @@ -5892,11 +5927,13 @@ class HealthMonitor: details=entry, ) elif pct >= warn_pct: - entry['status'] = 'WARNING' + entry['status'] = 'WARNING' if not is_dismissed else 'INFO' entry['error_key'] = error_key entry['dismissable'] = True + entry['dismissed'] = is_dismissed checks[name] = entry - warning_labels.append(name) + if not is_dismissed: + warning_labels.append(name) emitted_keys.add(error_key) health_persistence.record_error( error_key=error_key, diff --git a/AppImage/scripts/lxc_mount_points.py b/AppImage/scripts/lxc_mount_points.py index 9ec7467d..88f55d38 100644 --- a/AppImage/scripts/lxc_mount_points.py +++ b/AppImage/scripts/lxc_mount_points.py @@ -231,17 +231,134 @@ def _df_path(path: str) -> dict[str, Optional[int]]: return empty +_SIZE_UNIT_TO_BYTES = { + "": 1, "B": 1, + "K": 1024, "KB": 1024, "KIB": 1024, + "M": 1024 ** 2, "MB": 1024 ** 2, "MIB": 1024 ** 2, + "G": 1024 ** 3, "GB": 1024 ** 3, "GIB": 1024 ** 3, + "T": 1024 ** 4, "TB": 1024 ** 4, "TIB": 1024 ** 4, +} + + +def _parse_pve_size(value: str) -> Optional[int]: + """Convert PVE-style sizes (``150G``, ``32M``, ``2T``) to bytes. + + PVE stores volume sizes in lxc.conf as ``size=`` where + unit is a single letter from {K,M,G,T} (powers of 1024). Returns + None for empty/unparseable input — callers fall through to + pvesm-based totals. + """ + if value is None: + return None + s = str(value).strip().upper() + if not s: + return None + m = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGT]?I?B?)$", s) + if not m: + return None + try: + magnitude = float(m.group(1)) + except ValueError: + return None + unit = m.group(2) or "" + multiplier = _SIZE_UNIT_TO_BYTES.get(unit) + if multiplier is None: + return None + return int(magnitude * multiplier) + + +def _df_via_host_pid(host_pid: str, ct_target: str) -> dict[str, Optional[int]]: + """``df`` the CT-internal path via ``/proc//root`` so we get + the filesystem as the container sees it, including ZFS dataset + quotas. Used for ``pve_volume`` mounts whose ``pvesm status`` + numbers reflect the whole storage pool instead of the per-subvol + quota — without this the UI showed 851 GB total for a 150 GB ZFS + subvol because pvesm reports the rpool's free space. + """ + empty = {"total_bytes": None, "used_bytes": None, "available_bytes": None} + if not host_pid or not ct_target: + return empty + full = f"/proc/{host_pid}/root{ct_target}" + try: + proc = subprocess.run( + ["df", "-B1", "--output=size,used,avail", full], + capture_output=True, text=True, timeout=_STAT_TIMEOUT, + ) + if proc.returncode != 0: + return empty + lines = [ln for ln in proc.stdout.strip().splitlines() if ln.strip()] + if len(lines) < 2: + return empty + parts = lines[-1].split() + if len(parts) < 3: + return empty + return { + "total_bytes": int(parts[0]), + "used_bytes": int(parts[1]), + "available_bytes": int(parts[2]), + } + except (subprocess.TimeoutExpired, OSError, ValueError): + return empty + + def _capacity_for(source: str, classification: dict[str, Any], - pve_storages: dict[str, dict[str, Any]]) -> dict[str, Optional[int]]: + pve_storages: dict[str, dict[str, Any]], + config_options: Optional[dict[str, Any]] = None, + host_pid: str = "", + target: str = "") -> dict[str, Optional[int]]: """Return total/used/available bytes for the *source* of a mount. - ``pve_volume`` and ``pve_storage_bind`` reuse the numbers from - ``pvesm status`` (already loaded once). ``host_bind`` falls back to - ``df`` of the host path. None values mean the lookup didn't - succeed and the UI will render n/a. + ``pve_volume`` quota handling (Sprint 14.x — Ignacio Seijo 10/05): + A ``mp6: local-zfs:subvol-310-disk-1,size=150G,...`` line carved + out a 150 GB subvol from a 1 TB pool. The previous code read + ``pvesm status local-zfs`` and reported 851 GB total / 19% used — + reflecting the whole pool, not the subvol. We now prefer, in + order: + 1) ``df`` of ``/proc//root/`` when the CT is + up — gives the correct view-from-inside numbers including + the quota. + 2) ``size=`` from lxc.conf as the total; usage is unknown + when the CT isn't running, so the UI shows total only. + 3) Fallback to ``pvesm status`` (pool numbers) when the entry + has no declared size — that's the legacy behaviour for + sizeless block volumes (lvm raw, rbd). + + ``pve_storage_bind`` mounts (NFS, CIFS at ``/mnt/pve/...``) keep + the pvesm-based numbers because the storage IS the source of truth + for those. + + ``host_bind`` falls back to ``df`` of the host path. None values + mean the lookup didn't succeed and the UI will render n/a. """ ctype = classification.get("type") - if ctype in ("pve_volume", "pve_storage_bind"): + config_options = config_options or {} + declared_size_bytes = _parse_pve_size(config_options.get("size")) + + if ctype == "pve_volume": + # 1) Live numbers from inside the CT (respects quota). + if host_pid and target: + live = _df_via_host_pid(host_pid, target) + if live.get("total_bytes") is not None: + return live + # 2) CT down (or df failed): expose declared quota as total. + if declared_size_bytes is not None: + return { + "total_bytes": declared_size_bytes, + "used_bytes": None, + "available_bytes": None, + } + # 3) No quota declared: legacy pool-level numbers. + sid = classification.get("origin_storage", "") + st = pve_storages.get(sid) + if not st: + return {"total_bytes": None, "used_bytes": None, "available_bytes": None} + return { + "total_bytes": st["total_kib"] * 1024 if st.get("total_kib") is not None else None, + "used_bytes": st["used_kib"] * 1024 if st.get("used_kib") is not None else None, + "available_bytes": st["avail_kib"] * 1024 if st.get("avail_kib") is not None else None, + } + + if ctype == "pve_storage_bind": sid = classification.get("origin_storage", "") st = pve_storages.get(sid) if not st: @@ -312,6 +429,45 @@ def _read_ct_proc_mounts(host_pid: str) -> list[dict[str, Any]]: return out +def _host_source_state(source: str) -> dict[str, Any]: + """Inspect a host-side bind source to detect 'zombie' binds. + + Reported by Ignacio Seijo (11/05): when the host unmounted + ``/mnt/nas1_con_backup`` the CT kept reporting it as ``mounted`` + because the bind into the CT's mount namespace was still live — + the kernel doesn't propagate the host-side umount to the child + namespace. The CT's view becomes a frozen snapshot of whatever + was under the path at bind time (usually an empty dir). + + Returns ``{exists, is_mountpoint, error}``. ``exists=False`` means + the source path is gone entirely (e.g. a USB drive that was + physically removed). ``is_mountpoint=False`` while ``exists=True`` + is the zombie-bind case the UI flags. + + Only meaningful for absolute host paths. Storage-id sources + (``local-zfs:subvol-...``) return ``{None, None, None}`` since + there is no host path to inspect. + """ + empty = {"exists": None, "is_mountpoint": None, "error": None} + if not source or not source.startswith("/"): + return empty + try: + st_exists = os.path.exists(source) + except OSError as e: + return {"exists": None, "is_mountpoint": None, "error": str(e)} + if not st_exists: + return {"exists": False, "is_mountpoint": False, "error": "path missing"} + try: + proc = subprocess.run( + ["mountpoint", "-q", source], + capture_output=True, text=True, timeout=_STAT_TIMEOUT, + ) + is_mp = (proc.returncode == 0) + return {"exists": True, "is_mountpoint": is_mp, "error": None} + except (subprocess.TimeoutExpired, OSError) as e: + return {"exists": True, "is_mountpoint": None, "error": str(e)} + + def _stat_via_host(host_pid: str, ct_target: str, timeout: int = _STAT_TIMEOUT) -> dict[str, Any]: """Stat the container-internal target through /proc//root — @@ -366,11 +522,37 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]: out: list[dict[str, Any]] = [] matched_targets: set[str] = set() - for entry in config_entries: + # Pre-compute per-entry subprocess work in parallel so a CT with + # many mountpoints doesn't pay N×(_STAT_TIMEOUT + _STAT_TIMEOUT) + # serialised cost. The previous serial path tripped Caddy's 3s + # reverse-proxy timeout (Ignacio Seijo 11/05: "/api/lxc/210/ + # mount-points → 502 (3.00s)") on hosts with 5+ binds. ThreadPool + # is the right primitive — these are all I/O-bound `df`/`stat` + # calls hitting independent paths. + from concurrent.futures import ThreadPoolExecutor + + def _gather_one(entry): + src = entry.get("source", "") + tgt = entry.get("target", "") + classification = _classify(src, pve_storages) + capacity = _capacity_for( + src, classification, pve_storages, + config_options=entry.get("config_options", {}), + host_pid=host_pid if running else "", + target=tgt, + ) + host_src = _host_source_state(src) + live_target = bool(running and tgt and tgt in rt_by_target) + health = _stat_via_host(host_pid, tgt) if live_target else None + return entry, classification, capacity, host_src, live_target, health + + max_workers = max(2, min(8, len(config_entries) or 1)) + with ThreadPoolExecutor(max_workers=max_workers) as pool: + gathered = list(pool.map(_gather_one, config_entries)) + + for entry, cls, cap, host_src, live_target, health in gathered: source = entry.get("source", "") target = entry.get("target", "") - cls = _classify(source, pve_storages) - cap = _capacity_for(source, cls, pve_storages) item: dict[str, Any] = { "mp_index": entry.get("mp_index", ""), @@ -382,13 +564,14 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]: "origin_label": cls.get("origin_label", source), "config_options": entry.get("config_options", {}), "config_flags": entry.get("config_flags", []), + "host_source_exists": host_src["exists"], + "host_source_is_mountpoint": host_src["is_mountpoint"], **cap, } # Runtime enrichment when CT is up. - if running and target and target in rt_by_target: + if live_target: rt = rt_by_target[target] - health = _stat_via_host(host_pid, target) item.update({ "runtime_mounted": True, "runtime_source": rt["rt_source"], @@ -416,34 +599,42 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]: # original Sprint 13.24 issue revolves around catching them. ad_hoc: list[dict[str, Any]] = [] if running: - for rt in rt_mounts: - target = rt["rt_target"] - if target in matched_targets: - continue - if not _REMOTE_FS_RE.match(rt["rt_fstype"]): - continue - health = _stat_via_host(host_pid, target) - ad_hoc.append({ - "mp_index": "", - "source": rt["rt_source"], - "target": target, - "type": "ad_hoc", - "origin_storage": "", - "origin_storage_type": "", - "origin_label": rt["rt_source"], - "config_options": {}, - "config_flags": [], - "total_bytes": None, - "used_bytes": None, - "available_bytes": None, - "runtime_mounted": True, - "runtime_source": rt["rt_source"], - "runtime_fstype": rt["rt_fstype"], - "runtime_options": rt["rt_options"], - "runtime_readonly": rt["rt_readonly"], - "runtime_reachable": health["reachable"], - "runtime_error": health["error"], - }) + ad_hoc_candidates = [ + rt for rt in rt_mounts + if rt["rt_target"] not in matched_targets + and _REMOTE_FS_RE.match(rt["rt_fstype"]) + ] + # Same parallelisation as the configured-mp loop: stat'ing + # stale NFS exports serially can dominate the request and + # push it past the proxy timeout. + if ad_hoc_candidates: + with ThreadPoolExecutor(max_workers=max_workers) as pool: + healths = list(pool.map( + lambda rt: _stat_via_host(host_pid, rt["rt_target"]), + ad_hoc_candidates, + )) + for rt, health in zip(ad_hoc_candidates, healths): + ad_hoc.append({ + "mp_index": "", + "source": rt["rt_source"], + "target": rt["rt_target"], + "type": "ad_hoc", + "origin_storage": "", + "origin_storage_type": "", + "origin_label": rt["rt_source"], + "config_options": {}, + "config_flags": [], + "total_bytes": None, + "used_bytes": None, + "available_bytes": None, + "runtime_mounted": True, + "runtime_source": rt["rt_source"], + "runtime_fstype": rt["rt_fstype"], + "runtime_options": rt["rt_options"], + "runtime_readonly": rt["rt_readonly"], + "runtime_reachable": health["reachable"], + "runtime_error": health["error"], + }) return { "ok": True, diff --git a/AppImage/scripts/notification_channels.py b/AppImage/scripts/notification_channels.py index 8b713f34..1c67a5f4 100644 --- a/AppImage/scripts/notification_channels.py +++ b/AppImage/scripts/notification_channels.py @@ -508,14 +508,22 @@ class EmailChannel(NotificationChannel): def __init__(self, config: Dict[str, str]): super().__init__() - self.host = config.get('host', '') + self.host = (config.get('host', '') or '').strip() self.port = int(config.get('port', 587) or 587) - self.username = config.get('username', '') - self.password = config.get('password', '') - self.tls_mode = config.get('tls_mode', 'starttls') # none | starttls | ssl - self.from_address = config.get('from_address', '') + self.username = config.get('username', '') or '' + self.password = config.get('password', '') or '' + # `dict.get(k, default)` only returns default when the key is MISSING; + # if the user previously saved an empty string or null, we'd end up + # with `tls_mode=''` and silently skip STARTTLS — which causes + # `SMTPNotSupportedError: SMTP AUTH extension not supported by server` + # on Gmail/Outlook because they only advertise AUTH post-STARTTLS. + tls_raw = (config.get('tls_mode') or 'starttls').strip().lower() + if tls_raw not in ('none', 'starttls', 'ssl'): + tls_raw = 'starttls' + self.tls_mode = tls_raw + self.from_address = config.get('from_address', '') or '' self.to_addresses = self._parse_recipients(config.get('to_addresses', '')) - self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') + self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') or '[ProxMenux]' self.timeout = int(config.get('timeout', 10) or 10) @staticmethod @@ -529,6 +537,17 @@ class EmailChannel(NotificationChannel): return False, 'No recipients configured' if not self.from_address: return False, 'No from address configured' + # Credentials without an explicit SMTP host would silently fall back to + # `/usr/sbin/sendmail`, which ignores username/password entirely — the + # test returns OK because Postfix queued the message, but the relay is + # never authenticated and the mail rots in the local mailq. Reported by + # Ignacio Seijo: "dejando host/puerto en blanco el test pasa pero el + # correo nunca llega". + if (self.username or self.password) and not self.host: + return False, ('SMTP credentials provided but no host configured. ' + 'Set host (e.g. smtp.gmail.com) and port (587) — ' + 'without a host the message goes to the local MTA ' + 'and your username/password are ignored.') # Must have SMTP host OR local sendmail available if not self.host: import os @@ -591,8 +610,33 @@ class EmailChannel(NotificationChannel): server.ehlo() # Re-identify after TLS -- server re-announces AUTH if self.username and self.password: + # If the server doesn't advertise AUTH after our EHLO sequence, + # smtplib's `login()` raises `SMTPNotSupportedError` with the + # opaque message "SMTP AUTH extension not supported by server". + # That fired for users who left tls_mode blank or pointed at + # port 587 without STARTTLS — Gmail only advertises AUTH after + # the TLS handshake. Surface the real reason here. + if not server.has_extn('auth'): + hint = ( + f"server={self.host}:{self.port} tls_mode={self.tls_mode}" + ) + if self.tls_mode == 'none': + return 0, ( + 'SMTP server did not advertise AUTH after EHLO. ' + 'TLS is disabled — most providers (Gmail, Outlook, ' + 'Office365) only allow login after STARTTLS or SSL. ' + f'Switch TLS Mode to STARTTLS (port 587) or SSL/TLS ' + f'(port 465). [{hint}]' + ) + return 0, ( + 'SMTP server did not advertise AUTH after EHLO. ' + 'Verify the host/port/TLS combination. For Gmail use ' + 'smtp.gmail.com:587 with STARTTLS and an App Password ' + '(https://myaccount.google.com/apppasswords); for ' + f'Outlook use smtp.office365.com:587 with STARTTLS. [{hint}]' + ) server.login(self.username, self.password) - + server.send_message(msg) server.quit() server = None @@ -601,8 +645,10 @@ class EmailChannel(NotificationChannel): return 0, f'SMTP authentication failed (check username/password or app-specific password): {e}' except smtplib.SMTPNotSupportedError as e: return 0, (f'SMTP AUTH not supported by server. ' - f'This may mean the server requires OAuth2 or an App Password ' - f'instead of regular credentials: {e}') + f'TLS mode: {self.tls_mode}, port: {self.port}. ' + f'Gmail/Outlook require STARTTLS on 587 or SSL/TLS on 465. ' + f'For Gmail, generate an App Password at ' + f'https://myaccount.google.com/apppasswords. Detail: {e}') except smtplib.SMTPConnectError as e: return 0, f'SMTP connection failed: {e}' except smtplib.SMTPException as e: diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index f4e5ab6d..463e346d 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -292,6 +292,61 @@ def _record_smartd_observation_impl(title: str, message: str): print(f"[smartd_observation] Error recording smartd observation: {e}") +# ─── Vzdump activity detector (shared, restart-tolerant) ───────── +# +# A single source of truth for "is a vzdump backup job running on this +# host RIGHT NOW", consultable from any watcher and surviving Monitor +# restarts. Reads `/var/log/pve/tasks/active` directly — PVE writes the +# active UPID there at backup start and removes it on completion, so +# it persists across our process restarts. +# +# Without this, JournalWatcher's in-memory `_last_backup_job_ts` got +# reset by every Monitor restart, and any `Starting Backup of VM X` +# log lines arriving after that point were treated as standalone +# backups — emitting one `backup_start` per guest with `storage=local` +# (the fallback path that doesn't see the parent job's --storage flag). +# Reported by JC Miñarro 18/05 after a Monitor redeploy mid-job. +_VZDUMP_ACTIVE_FILE = '/var/log/pve/tasks/active' +_vzdump_active_cache_ts: float = 0 +_vzdump_active_cache_value: bool = False +_VZDUMP_ACTIVE_CACHE_TTL = 5 # seconds + + +def is_vzdump_active_on_host() -> bool: + """Return True if `/var/log/pve/tasks/active` contains an active + vzdump UPID (i.e. backup currently running). Cached 5s to avoid + hammering the file on every notification. + + Caller-safe: returns False on any I/O / parse error. + """ + global _vzdump_active_cache_ts, _vzdump_active_cache_value + now = time.time() + if now - _vzdump_active_cache_ts < _VZDUMP_ACTIVE_CACHE_TTL: + return _vzdump_active_cache_value + found = False + try: + with open(_VZDUMP_ACTIVE_FILE, 'r') as f: + for line in f: + # UPID format: UPID:node:pid:pstart:starttime:type:id:user: + if ':vzdump:' not in line: + continue + parts = line.strip().split(':') + if len(parts) < 3: + continue + try: + pid = int(parts[2], 16) # PID in UPID is hex + os.kill(pid, 0) + found = True + break + except (ValueError, ProcessLookupError, PermissionError): + continue + except (OSError, IOError): + pass + _vzdump_active_cache_ts = now + _vzdump_active_cache_value = found + return found + + # ─── Journal Watcher (Real-time) ───────────────────────────────── class JournalWatcher: @@ -1238,6 +1293,14 @@ class JournalWatcher: now = time.time() if now - self._last_backup_job_ts < self._BACKUP_JOB_SUPPRESS_WINDOW: return # Part of an active job -- already notified + # Restart-tolerant fallback: if the in-memory timestamp was + # cleared (Monitor restarted mid-job) but PVE still has an + # active vzdump UPID, this per-guest line is part of that + # job — drop it instead of emitting a wrong "Backup started + # on local" with storage default. Reported by JC Miñarro 18/05 + # after a Monitor redeploy during an active PBS backup. + if is_vzdump_active_on_host(): + return fallback_guest = fb.group(1) else: return @@ -1893,10 +1956,15 @@ class TaskWatcher: # Suppress VM/CT start/stop/shutdown while a vzdump is active. # These are backup-induced operations (mode=stop), not user actions. # Exception: if a VM/CT FAILS or has WARNINGS, that IS important. + # We check BOTH our in-memory tracking (`_is_vzdump_active`) AND + # `tasks/active` on disk (`is_vzdump_active_on_host`). The disk + # check survives Monitor restarts mid-backup, which otherwise + # cleared `_vzdump_running_since` and exposed the post-restart + # shutdown notifications to the user (JC Miñarro 18/05). _BACKUP_NOISE = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart', 'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'} if event_type in _BACKUP_NOISE and not is_error and not is_warning: - if self._is_vzdump_active(): + if self._is_vzdump_active() or is_vzdump_active_on_host(): return # Suppress VM/CT stop/shutdown during host shutdown/reboot. diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index c48628dd..ef4cd22e 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -223,14 +223,28 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]: else: total_time = f"{secs}s" + # ── Extract the storage target name (PBS, PBS-Cloud, local, …) ── + # PVE logs the full command on the first line: + # "INFO: starting new backup job: vzdump 104 105 --storage PBS-Cloud --mode stop" + # We surface it so the notification body can say "PBS-Cloud: vm/104/…" + # instead of the generic "PBS:" prefix when multiple PBS endpoints + # are configured. Reported by JC Miñarro 18/05. + storage_name = '' + for line in lines: + m_storage = re.search(r'--storage\s+(\S+)', line) + if m_storage: + storage_name = m_storage.group(1).strip() + break + if not vms and not total_size: return None - + return { 'vms': vms, 'total_time': total_time, 'total_size': total_size, 'vm_count': len(vms), + 'storage_name': storage_name, } @@ -277,13 +291,19 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str: if detail_line: parts.append(' | '.join(detail_line)) - # PBS/File on separate line with icon + # PBS/File on separate line with icon. When we know the + # storage name (e.g. "PBS-Cloud", "PBS-Office") prefix it so + # the user can tell which destination this archive lives in \u2014 + # critical when there are multiple PBS endpoints configured. if vm.get('filename'): fname = vm['filename'] + storage_name = parsed.get('storage_name', '') or '' if re.match(r'^(?:ct|vm)/\d+/', fname): - parts.append(f"\U0001F5C4\uFE0F PBS: {fname}") + label = storage_name if storage_name else 'PBS' + parts.append(f"\U0001F5C4\uFE0F {label}: {fname}") else: - parts.append(f"\U0001F4C1 File: {fname}") + label = storage_name if storage_name else 'File' + parts.append(f"\U0001F4C1 {label}: {fname}") # Error reason if failed if status != 'ok' and vm.get('error'): diff --git a/scripts/post_install/auto_post_install.sh b/scripts/post_install/auto_post_install.sh index 57762a4a..88e4d489 100644 --- a/scripts/post_install/auto_post_install.sh +++ b/scripts/post_install/auto_post_install.sh @@ -601,7 +601,7 @@ EOF install_log2ram_auto() { - local FUNC_VERSION="1.1" + local FUNC_VERSION="1.2" # description: Install Log2RAM with size auto-tuned to host RAM (128M/256M/512M); SSD/M.2 detection skips on rotational disks. # ── Reinstall detection ───────────────────────────────────────────────── # If log2ram was previously installed by ProxMenux, skip hardware detection @@ -732,6 +732,13 @@ EOF cat > /usr/local/bin/log2ram-check.sh <<'EOF' #!/usr/bin/env bash +# v1.2 — `log2ram write` only copies tmpfs→disk; it does NOT shrink +# the tmpfs. When journald or pveproxy/access.log grow past their +# limits the tmpfs hit 100% and PVE crashed with "No space left on +# device" on Shell open (community-reported: JC Miñarro, Nicolás P. +# de A., 17-18/05). We now vacuum the journal and truncate the +# non-rotating logs that actually consume the tmpfs before calling +# `log2ram write`. PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" CONF_FILE="/etc/log2ram.conf" @@ -742,7 +749,8 @@ L2R_BIN="$(command -v log2ram || true)" SIZE_MiB="$(grep -E '^SIZE=' "$CONF_FILE" 2>/dev/null | cut -d'=' -f2 | tr -dc '0-9')" [[ -z "$SIZE_MiB" ]] && SIZE_MiB=128 LIMIT_BYTES=$(( SIZE_MiB * 1024 * 1024 )) -THRESHOLD_BYTES=$(( LIMIT_BYTES * 95 / 100 )) +WARN_BYTES=$(( LIMIT_BYTES * 80 / 100 )) +EMERGENCY_BYTES=$(( LIMIT_BYTES * 92 / 100 )) USED_BYTES="$(df -B1 --output=used /var/log 2>/dev/null | tail -1 | tr -dc '0-9')" [[ -z "$USED_BYTES" ]] && exit 0 @@ -751,8 +759,24 @@ LOCK="/run/log2ram-check.lock" exec 9>"$LOCK" 2>/dev/null || exit 0 flock -n 9 || exit 0 -if (( USED_BYTES > THRESHOLD_BYTES )); then - "$L2R_BIN" write 2>/dev/null || true +# `log2ram write` alone leaves the tmpfs full. Real recovery requires: +# (a) journal vacuum — journald respects --vacuum-size unconditionally, +# unlike SystemMaxUse which only enforces on rotation boundaries; +# (b) truncating logs that aren't rotated by logrotate (pveproxy, pveam); +# (c) THEN syncing to disk so the persistent copy reflects reality. +if (( USED_BYTES > EMERGENCY_BYTES )); then + SAFE_JOURNAL_MB=$(( SIZE_MiB * 5 / 100 )) + [[ "$SAFE_JOURNAL_MB" -lt 16 ]] && SAFE_JOURNAL_MB=16 + journalctl --vacuum-size="${SAFE_JOURNAL_MB}M" >/dev/null 2>&1 || true + : > /var/log/pveproxy/access.log 2>/dev/null || true + : > /var/log/pveproxy/error.log 2>/dev/null || true + : > /var/log/pveam.log 2>/dev/null || true + "$L2R_BIN" write 2>/dev/null || true +elif (( USED_BYTES > WARN_BYTES )); then + SOFT_JOURNAL_MB=$(( SIZE_MiB * 30 / 100 )) + [[ "$SOFT_JOURNAL_MB" -lt 32 ]] && SOFT_JOURNAL_MB=32 + journalctl --vacuum-size="${SOFT_JOURNAL_MB}M" >/dev/null 2>&1 || true + "$L2R_BIN" write 2>/dev/null || true fi EOF chmod +x /usr/local/bin/log2ram-check.sh @@ -770,7 +794,7 @@ EOF chown root:root /etc/cron.d/log2ram-auto-sync systemctl restart cron >/dev/null 2>&1 || true - msg_ok "$(translate "Auto-sync enabled when /var/log exceeds 95% of") $LOG2RAM_SIZE" + msg_ok "$(translate "Auto-sync enabled when /var/log exceeds 80% of") $LOG2RAM_SIZE" msg_info "$(translate "Adjusting systemd-journald limits to match Log2RAM size...")" @@ -801,6 +825,11 @@ Storage=persistent SplitMode=none RateLimitIntervalSec=30s RateLimitBurst=1000 +ForwardToSyslog=no +ForwardToWall=no +Seal=no +Compress=yes +SystemMaxUse=${USE_MB}M SystemKeepFree=${KEEP_MB}M RuntimeMaxUse=${RUNTIME_MB}M # MaxLevelStore=info: required for ProxMenux Monitor log display and Fail2Ban detection. diff --git a/scripts/post_install/customizable_post_install.sh b/scripts/post_install/customizable_post_install.sh index 81e467cc..309adfb7 100644 --- a/scripts/post_install/customizable_post_install.sh +++ b/scripts/post_install/customizable_post_install.sh @@ -2350,7 +2350,7 @@ update_pve_appliance_manager() { configure_log2ram() { - local FUNC_VERSION="1.1" + local FUNC_VERSION="1.2" # description: Install Log2RAM with user-chosen RAM size; prompts for size and SSD/M.2 awareness before applying. msg_info2 "$(translate "Preparing Log2RAM configuration")" sleep 1 @@ -2477,6 +2477,13 @@ EOF if [[ "$ENABLE_AUTOSYNC" == true ]]; then cat > /usr/local/bin/log2ram-check.sh <<'EOF' #!/usr/bin/env bash +# v1.2 — `log2ram write` only copies tmpfs→disk; it does NOT shrink +# the tmpfs. When journald or pveproxy/access.log grow past their +# limits the tmpfs hit 100% and PVE crashed with "No space left on +# device" on Shell open (community-reported: JC Miñarro, Nicolás P. +# de A., 17-18/05). We now vacuum the journal and truncate the +# non-rotating logs that actually consume the tmpfs before calling +# `log2ram write`. PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" CONF_FILE="/etc/log2ram.conf" L2R_BIN="$(command -v log2ram || true)" @@ -2486,7 +2493,8 @@ L2R_BIN="$(command -v log2ram || true)" SIZE_MiB="$(grep -E '^SIZE=' "$CONF_FILE" 2>/dev/null | cut -d'=' -f2 | tr -dc '0-9')" [[ -z "$SIZE_MiB" ]] && SIZE_MiB=128 LIMIT_BYTES=$(( SIZE_MiB * 1024 * 1024 )) -THRESHOLD_BYTES=$(( LIMIT_BYTES * 90 / 100 )) +WARN_BYTES=$(( LIMIT_BYTES * 80 / 100 )) +EMERGENCY_BYTES=$(( LIMIT_BYTES * 92 / 100 )) USED_BYTES="$(df -B1 --output=used /var/log 2>/dev/null | tail -1 | tr -dc '0-9')" [[ -z "$USED_BYTES" ]] && exit 0 @@ -2495,8 +2503,24 @@ LOCK="/run/log2ram-check.lock" exec 9>"$LOCK" 2>/dev/null || exit 0 flock -n 9 || exit 0 -if (( USED_BYTES > THRESHOLD_BYTES )); then - "$L2R_BIN" write 2>/dev/null || true +# `log2ram write` alone leaves the tmpfs full. Real recovery requires: +# (a) journal vacuum — journald respects --vacuum-size unconditionally, +# unlike SystemMaxUse which only enforces on rotation boundaries; +# (b) truncating logs that aren't rotated by logrotate (pveproxy, pveam); +# (c) THEN syncing to disk so the persistent copy reflects reality. +if (( USED_BYTES > EMERGENCY_BYTES )); then + SAFE_JOURNAL_MB=$(( SIZE_MiB * 5 / 100 )) + [[ "$SAFE_JOURNAL_MB" -lt 16 ]] && SAFE_JOURNAL_MB=16 + journalctl --vacuum-size="${SAFE_JOURNAL_MB}M" >/dev/null 2>&1 || true + : > /var/log/pveproxy/access.log 2>/dev/null || true + : > /var/log/pveproxy/error.log 2>/dev/null || true + : > /var/log/pveam.log 2>/dev/null || true + "$L2R_BIN" write 2>/dev/null || true +elif (( USED_BYTES > WARN_BYTES )); then + SOFT_JOURNAL_MB=$(( SIZE_MiB * 30 / 100 )) + [[ "$SOFT_JOURNAL_MB" -lt 32 ]] && SOFT_JOURNAL_MB=32 + journalctl --vacuum-size="${SOFT_JOURNAL_MB}M" >/dev/null 2>&1 || true + "$L2R_BIN" write 2>/dev/null || true fi EOF chmod +x /usr/local/bin/log2ram-check.sh @@ -2510,7 +2534,7 @@ MAILTO="" EOF chmod 0644 /etc/cron.d/log2ram-auto-sync chown root:root /etc/cron.d/log2ram-auto-sync - msg_ok "$(translate "Auto-sync enabled when /var/log exceeds 90% of") $LOG2RAM_SIZE" + msg_ok "$(translate "Auto-sync enabled when /var/log exceeds 80% of") $LOG2RAM_SIZE" else rm -f /usr/local/bin/log2ram-check.sh /etc/cron.d/log2ram-auto-sync 2>/dev/null || true msg_info2 "$(translate "Auto-sync was not enabled")"