1.2.1.1-beta: notification + LXC + post-install fixes

- flask_notification_routes: PVE webhook X-Webhook-Secret written in
  standard base64 so PVE can decode it (GH #198)
- notification_channels: Gmail SMTP App Password handling — normalize
  tls_mode (None/empty → starttls), reject creds without host (false-
  positive sendmail delivery), surface "AUTH not advertised" hint
- notification_events: is_vzdump_active_on_host() reads /var/log/pve/
  tasks/active directly so backup_start fallback and vm_shutdown
  suppression survive a Monitor restart mid-backup
- notification_templates: extract --storage flag from vzdump log →
  "PBS-Cloud: vm/104/…" instead of generic "PBS:" prefix when multiple
  PBS endpoints exist
- health_monitor: pve_storage_capacity + zfs_pool_capacity respect
  per-item dismiss (don't keep category WARNING/CRITICAL after user
  dismisses); updates_check cache invalidated when /var/log/apt/
  history.log mtime advances
- lxc_mount_points: PVE volume size from subvol quota (df via
  /proc/<host_pid>/root/<target> + lxc.conf size=NNNG fallback);
  host_source_state detects "host detached" zombie binds; per-mount
  subprocess work parallelised via ThreadPoolExecutor so a CT with
  many bind mounts doesn't trip the Caddy 3s reverse-proxy timeout
- virtual-machines: "host detached" badge on bind mounts whose host
  source path disappeared
- auto/customizable_post_install: log2ram FUNC_VERSION 1.1 → 1.2; new
  log2ram-check.sh vacuums journal + truncates non-rotating logs
  (pveproxy/access.log, pveam.log) instead of only calling
  `log2ram write` (which leaves the tmpfs full); auto flow gains the
  missing SystemMaxUse in /etc/systemd/journald.conf

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MacRimi
2026-05-19 00:06:49 +02:00
parent 81844fa456
commit 6eb1312c61
11 changed files with 548 additions and 92 deletions
Binary file not shown.
+1 -1
View File
@@ -1 +1 @@
6249ae8d51e0d7dbd3035ba49f4244ff035c2c6d97d5c55f69ab0dac6a4ea021 ProxMenux-1.2.1.1-beta.AppImage
70a510025df81652319d16e0d36e77bea95a965163608232e9aca60ada9c9fbf ProxMenux-1.2.1.1-beta.AppImage
+24 -7
View File
@@ -170,6 +170,12 @@ interface LxcMountPoint {
runtime_readonly?: boolean
runtime_reachable?: boolean
runtime_error?: string | null
// Sprint 14.x: host-side bind source state. Detects the case where the
// CT still reports a bind as mounted even though the host already
// umounted the source (Ignacio Seijo 11/05). Null = N/A (PVE volume,
// not a host path).
host_source_exists?: boolean | null
host_source_is_mountpoint?: boolean | null
}
const fetcher = async (url: string) => {
@@ -321,9 +327,18 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
const isStale = mp.runtime_reachable === false
const isReadonly = !isStale && mp.runtime_readonly === true
const isDivergent = mp.runtime_mounted === false // configured but not actually mounted
// "Zombie bind": the host removed the source (e.g. USB pulled, manual
// umount) but the CT mount namespace still shows the bind as mounted.
// Reported by Ignacio Seijo (11/05). Only flag host_bind /
// pve_storage_bind sources — PVE volume sources have no host path
// and `host_source_exists` comes back null for them.
const isHostDetached =
mp.runtime_mounted === true &&
(mp.type === "host_bind" || mp.type === "pve_storage_bind") &&
mp.host_source_exists === false
const cardClasses = isStale
? "border-red-500/50 bg-red-500/5"
: isDivergent
: isDivergent || isHostDetached
? "border-amber-500/40 bg-amber-500/5"
: isReadonly
? "border-amber-500/30 bg-amber-500/5"
@@ -395,7 +410,7 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
className={
isStale
? "bg-red-500/10 text-red-500 border-red-500/20"
: isDivergent
: isDivergent || isHostDetached
? "bg-amber-500/10 text-amber-500 border-amber-500/20"
: isReadonly
? "bg-amber-500/10 text-amber-500 border-amber-500/20"
@@ -408,11 +423,13 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
? "stale"
: isDivergent
? "not mounted"
: isReadonly
? "read-only"
: mp.runtime_mounted === null
? "stopped"
: "mounted"}
: isHostDetached
? "host detached"
: isReadonly
? "read-only"
: mp.runtime_mounted === null
? "stopped"
: "mounted"}
</Badge>
</div>
+26 -2
View File
@@ -191,6 +191,24 @@ def _bad_request(msg: str):
return jsonify({'error': msg}), 400
def _is_loopback_addr(value: str) -> bool:
    """Return True when ``value`` is the textual address of a loopback peer.

    Accepted forms:

    * an IPv4 or IPv6 loopback address (``127.0.0.1``, ``::1``);
    * an IPv4-mapped IPv6 loopback address (``::ffff:127.0.0.1``). When
      Flask is bound to ``::`` for dual-stack support, a request sent to
      ``127.0.0.1`` can be reported in this form; it is treated as local
      so the PVE webhook keeps the intended localhost trust path;
    * the literal string ``'localhost'``.

    Args:
        value: Peer address as a string (e.g. ``request.remote_addr``).

    Returns:
        True for the forms above, False for everything else, including
        any non-string input.
    """
    # ``ipaddress.ip_address`` also accepts integers and packed bytes, so
    # without this guard ``2130706433`` or ``b'\x7f\x00\x00\x01'`` would be
    # classified as 127.0.0.1. This predicate gates a trust path, so only
    # genuine strings are evaluated.
    if not isinstance(value, str):
        return False

    import ipaddress

    try:
        addr = ipaddress.ip_address(value)
    except ValueError:
        # Not an IP literal at all; the only hostname honoured is 'localhost'.
        return value == 'localhost'

    if addr.is_loopback:
        return True
    # Only IPv6Address exposes ``ipv4_mapped``; it is None unless the
    # address lies in ::ffff:0:0/96.
    mapped = getattr(addr, 'ipv4_mapped', None)
    return bool(mapped and mapped.is_loopback)
def _validate_event_type(value: str) -> bool:
    """Return True when ``value`` is a string accepted by ``_EVENT_TYPE_RE``.

    Non-string input is rejected before the pattern is consulted. The
    pattern is applied with ``match``, i.e. anchored at the start of the
    string; what it permits is defined by ``_EVENT_TYPE_RE`` elsewhere in
    the module.
    """
    if not isinstance(value, str):
        return False
    return _EVENT_TYPE_RE.match(value) is not None
@@ -983,9 +1001,15 @@ def setup_pve_webhook_core() -> dict:
# endpoint depends entirely on the localhost-bypass and any move
# to a non-loopback bind silently breaks auth. Audit Tier 3.1 —
# `setup_pve_webhook_core` does not write the secret to priv cfg.
#
# PVE stores `secret value=` in STANDARD base64 and decodes it
# before emitting the header. Writing the raw token here triggered
# `could not decode UTF8 string from base64, key 'X-Webhook-Secret' (500)`
# whenever `token_urlsafe` produced `-` or `_` chars (GH #198).
secret_b64 = base64.b64encode(secret.encode()).decode()
priv_block = (
f"webhook: {_PVE_ENDPOINT_ID}\n"
f" secret name=X-Webhook-Secret,value={secret}\n"
f" secret name=X-Webhook-Secret,value={secret_b64}\n"
)
if priv_text is not None:
@@ -1225,7 +1249,7 @@ def proxmox_webhook():
_reject = lambda code, error, status: (jsonify({'accepted': False, 'error': error}), status)
client_ip = request.remote_addr or ''
is_localhost = client_ip in ('127.0.0.1', '::1')
is_localhost = _is_loopback_addr(client_ip)
# CSRF defence-in-depth: reject `application/x-www-form-urlencoded`
# bodies. PVE always sends `application/json`; form-encoded bodies
+56 -19
View File
@@ -4197,22 +4197,37 @@ class HealthMonitor:
"""
cache_key = 'updates_check'
current_time = time.time()
# Cache for 10 minutes
if cache_key in self.last_check_times:
if current_time - self.last_check_times[cache_key] < 600:
return self.cached_results.get(cache_key)
apt_history_path = '/var/log/apt/history.log'
# Detect a manual `apt install/upgrade` since the last check by
# comparing /var/log/apt/history.log's mtime against the cache
# timestamp. apt appends to this file on every transaction, so a
# newer mtime means the local package state changed and the cached
# pending-updates list is stale. Reported by Alberto (14/5): the
# dashboard tile kept showing pending updates ~hours after he ran
# `apt upgrade` manually. Cheap stat call; runs at most once per
# /api/health/full request.
history_mtime = None
try:
if os.path.exists(apt_history_path):
history_mtime = os.path.getmtime(apt_history_path)
except Exception:
history_mtime = None
if cache_key in self.last_check_times:
cache_ts = self.last_check_times[cache_key]
history_changed = (history_mtime is not None and history_mtime > cache_ts)
if not history_changed and current_time - cache_ts < 600:
return self.cached_results.get(cache_key)
try:
apt_history_path = '/var/log/apt/history.log'
last_update_days = None
sec_result = None
age_result = None
if os.path.exists(apt_history_path):
if history_mtime is not None:
try:
mtime = os.path.getmtime(apt_history_path)
days_since_update = (current_time - mtime) / 86400
days_since_update = (current_time - history_mtime) / 86400
last_update_days = int(days_since_update)
except Exception:
pass
@@ -5775,12 +5790,24 @@ class HealthMonitor:
'used_bytes': used,
}
error_key = f'pve_storage_full_{name}'
# If the user already dismissed this exact error (within the
# suppression window), don't count it toward the category
# severity badge. Without this guard the storage section stayed
# WARNING/CRITICAL forever even after dismiss because the
# underlying % is unchanged — `record_error` correctly returned
# `skipped_acknowledged` to silence the notification side, but
# the dashboard counter ignored that signal and the user saw
# "Storage: 1 Warning" with no way to clear it. Reported on
# the community channel re: PBS-lleno (17-18/05).
is_dismissed = health_persistence.is_error_acknowledged(error_key)
if pct >= crit_pct:
entry['status'] = 'CRITICAL'
entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO'
entry['error_key'] = error_key
entry['dismissable'] = True
entry['dismissed'] = is_dismissed
checks[label] = entry
critical_labels.append(label)
if not is_dismissed:
critical_labels.append(label)
emitted_keys.add(error_key)
health_persistence.record_error(
error_key=error_key,
@@ -5790,11 +5817,13 @@ class HealthMonitor:
details=entry,
)
elif pct >= warn_pct:
entry['status'] = 'WARNING'
entry['status'] = 'WARNING' if not is_dismissed else 'INFO'
entry['error_key'] = error_key
entry['dismissable'] = True
entry['dismissed'] = is_dismissed
checks[label] = entry
warning_labels.append(label)
if not is_dismissed:
warning_labels.append(label)
emitted_keys.add(error_key)
health_persistence.record_error(
error_key=error_key,
@@ -5877,12 +5906,18 @@ class HealthMonitor:
'pool_name': name,
}
error_key = f'zfs_pool_full_{name}'
# Same dismiss-respect as `_check_pve_storage_capacity`. A pool
# that the user dismissed keeps its underlying % but should no
# longer flip the category badge to WARNING/CRITICAL.
is_dismissed = health_persistence.is_error_acknowledged(error_key)
if pct >= crit_pct:
entry['status'] = 'CRITICAL'
entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO'
entry['error_key'] = error_key
entry['dismissable'] = True
entry['dismissed'] = is_dismissed
checks[name] = entry
critical_labels.append(name)
if not is_dismissed:
critical_labels.append(name)
emitted_keys.add(error_key)
health_persistence.record_error(
error_key=error_key,
@@ -5892,11 +5927,13 @@ class HealthMonitor:
details=entry,
)
elif pct >= warn_pct:
entry['status'] = 'WARNING'
entry['status'] = 'WARNING' if not is_dismissed else 'INFO'
entry['error_key'] = error_key
entry['dismissable'] = True
entry['dismissed'] = is_dismissed
checks[name] = entry
warning_labels.append(name)
if not is_dismissed:
warning_labels.append(name)
emitted_keys.add(error_key)
health_persistence.record_error(
error_key=error_key,
+230 -39
View File
@@ -231,17 +231,134 @@ def _df_path(path: str) -> dict[str, Optional[int]]:
return empty
# Unit suffix (upper-cased) -> multiplier in bytes. All units are binary
# (powers of 1024), including the "KB"/"MB" spellings.
_SIZE_UNIT_TO_BYTES = {
    "": 1, "B": 1,
    "K": 1024, "KB": 1024, "KIB": 1024,
    "M": 1024 ** 2, "MB": 1024 ** 2, "MIB": 1024 ** 2,
    "G": 1024 ** 3, "GB": 1024 ** 3, "GIB": 1024 ** 3,
    "T": 1024 ** 4, "TB": 1024 ** 4, "TIB": 1024 ** 4,
}


def _parse_pve_size(value: Optional[str]) -> Optional[int]:
    """Convert PVE-style sizes (``150G``, ``32M``, ``2T``) to bytes.

    PVE stores volume sizes in lxc.conf as ``size=<num><unit>``. The
    number may carry a decimal fraction, whitespace between number and
    unit is tolerated, and the unit is matched case-insensitively
    against ``_SIZE_UNIT_TO_BYTES`` (a missing unit means bytes).

    The conversion uses integer arithmetic only, so large values are
    exact (a float intermediate silently loses precision above 2**53).
    Fractional byte counts are truncated.

    Args:
        value: Size text, or ``None``. Non-string values are converted
            with ``str()`` first.

    Returns:
        The size in bytes, or ``None`` for empty/unparseable input or an
        unknown unit. Callers fall through to pvesm-based totals then.
    """
    if value is None:
        return None
    text = str(value).strip().upper()
    # <integer part>[.<fraction>] <optional letters>; the letters are
    # validated by the table lookup so the two cannot drift apart.
    m = re.fullmatch(r"(\d+)(?:\.(\d+))?\s*([A-Z]*)", text)
    if not m:
        return None
    multiplier = _SIZE_UNIT_TO_BYTES.get(m.group(3))
    if multiplier is None:
        return None
    whole = m.group(1)
    frac = m.group(2) or ""
    # "1.5" -> 15 / 10**1: scale first, divide last, so the result is the
    # exact floor of magnitude * multiplier.
    return int(whole + frac) * multiplier // (10 ** len(frac))
def _df_via_host_pid(host_pid: str, ct_target: str) -> dict[str, Optional[int]]:
    """Run ``df`` on a container-internal path through ``/proc/<pid>/root``.

    Going through the container's root gives the filesystem as the
    container sees it, including ZFS dataset quotas. This is used for
    ``pve_volume`` mounts, where ``pvesm status`` reports the whole
    storage pool rather than the per-subvol quota (the UI showed 851 GB
    total for a 150 GB subvol).

    Args:
        host_pid: PID whose ``/proc/<pid>/root`` is used as the prefix.
        ct_target: Path inside the container; it is appended verbatim to
            the ``/proc/<pid>/root`` prefix.

    Returns:
        ``{"total_bytes", "used_bytes", "available_bytes"}`` as integers
        taken from the last line of ``df -B1 --output=size,used,avail``.
        All three are ``None`` when an argument is empty, ``df`` exits
        non-zero, times out, cannot be started, or prints fewer than two
        non-blank lines / fewer than three columns / non-numeric fields.
    """
    no_data = {"total_bytes": None, "used_bytes": None, "available_bytes": None}
    if not (host_pid and ct_target):
        return no_data

    command = [
        "df", "-B1", "--output=size,used,avail",
        f"/proc/{host_pid}/root{ct_target}",
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=_STAT_TIMEOUT,
        )
        if result.returncode != 0:
            return no_data
        # At least two non-blank lines are required; only the last one
        # is parsed.
        rows = [row for row in result.stdout.strip().splitlines() if row.strip()]
        if len(rows) < 2:
            return no_data
        fields = rows[-1].split()
        if len(fields) < 3:
            return no_data
        size, used, avail = (int(tok) for tok in fields[:3])
    except (subprocess.TimeoutExpired, OSError, ValueError):
        return no_data
    return {"total_bytes": size, "used_bytes": used, "available_bytes": avail}
def _capacity_for(source: str, classification: dict[str, Any],
pve_storages: dict[str, dict[str, Any]]) -> dict[str, Optional[int]]:
pve_storages: dict[str, dict[str, Any]],
config_options: Optional[dict[str, Any]] = None,
host_pid: str = "",
target: str = "") -> dict[str, Optional[int]]:
"""Return total/used/available bytes for the *source* of a mount.
``pve_volume`` and ``pve_storage_bind`` reuse the numbers from
``pvesm status`` (already loaded once). ``host_bind`` falls back to
``df`` of the host path. None values mean the lookup didn't
succeed and the UI will render n/a.
``pve_volume`` quota handling (Sprint 14.x — Ignacio Seijo 10/05):
A ``mp6: local-zfs:subvol-310-disk-1,size=150G,...`` line carved
out a 150 GB subvol from a 1 TB pool. The previous code read
``pvesm status local-zfs`` and reported 851 GB total / 19% used —
reflecting the whole pool, not the subvol. We now prefer, in
order:
1) ``df`` of ``/proc/<host_pid>/root/<target>`` when the CT is
up — gives the correct view-from-inside numbers including
the quota.
2) ``size=<N>`` from lxc.conf as the total; usage is unknown
when the CT isn't running, so the UI shows total only.
3) Fallback to ``pvesm status`` (pool numbers) when the entry
has no declared size — that's the legacy behaviour for
sizeless block volumes (lvm raw, rbd).
``pve_storage_bind`` mounts (NFS, CIFS at ``/mnt/pve/...``) keep
the pvesm-based numbers because the storage IS the source of truth
for those.
``host_bind`` falls back to ``df`` of the host path. None values
mean the lookup didn't succeed and the UI will render n/a.
"""
ctype = classification.get("type")
if ctype in ("pve_volume", "pve_storage_bind"):
config_options = config_options or {}
declared_size_bytes = _parse_pve_size(config_options.get("size"))
if ctype == "pve_volume":
# 1) Live numbers from inside the CT (respects quota).
if host_pid and target:
live = _df_via_host_pid(host_pid, target)
if live.get("total_bytes") is not None:
return live
# 2) CT down (or df failed): expose declared quota as total.
if declared_size_bytes is not None:
return {
"total_bytes": declared_size_bytes,
"used_bytes": None,
"available_bytes": None,
}
# 3) No quota declared: legacy pool-level numbers.
sid = classification.get("origin_storage", "")
st = pve_storages.get(sid)
if not st:
return {"total_bytes": None, "used_bytes": None, "available_bytes": None}
return {
"total_bytes": st["total_kib"] * 1024 if st.get("total_kib") is not None else None,
"used_bytes": st["used_kib"] * 1024 if st.get("used_kib") is not None else None,
"available_bytes": st["avail_kib"] * 1024 if st.get("avail_kib") is not None else None,
}
if ctype == "pve_storage_bind":
sid = classification.get("origin_storage", "")
st = pve_storages.get(sid)
if not st:
@@ -312,6 +429,45 @@ def _read_ct_proc_mounts(host_pid: str) -> list[dict[str, Any]]:
return out
def _host_source_state(source: str) -> dict[str, Any]:
    """Inspect a host-side bind source so stale ("zombie") binds can be detected.

    Background (reported by Ignacio Seijo, 11/05): after the host
    unmounted ``/mnt/nas1_con_backup`` the CT kept reporting the bind as
    ``mounted``, because the bind inside the CT's mount namespace stays
    live — the host-side umount is not propagated to the child namespace.

    Args:
        source: Mount source from the CT config. Only absolute host
            paths are inspected; storage-id sources such as
            ``local-zfs:subvol-...`` have no host path.

    Returns:
        A dict with keys ``exists``, ``is_mountpoint`` and ``error``:

        * all ``None`` when ``source`` is empty or not an absolute path;
        * ``exists=False, is_mountpoint=False, error="path missing"``
          when the path is gone (e.g. a removed USB drive);
        * ``exists=True`` with ``is_mountpoint`` set to whether
          ``mountpoint -q`` exited 0 — ``False`` here means the directory
          is present but was not reported as a mountpoint;
        * ``exists=True, is_mountpoint=None`` plus an ``error`` string
          when ``mountpoint`` timed out or could not be run.
    """
    if not source or not source.startswith("/"):
        return {"exists": None, "is_mountpoint": None, "error": None}

    try:
        present = os.path.exists(source)
    except OSError as exc:
        return {"exists": None, "is_mountpoint": None, "error": str(exc)}
    if not present:
        return {"exists": False, "is_mountpoint": False, "error": "path missing"}

    state: dict[str, Any] = {"exists": True, "is_mountpoint": None, "error": None}
    try:
        probe = subprocess.run(
            ["mountpoint", "-q", source],
            capture_output=True, text=True, timeout=_STAT_TIMEOUT,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        state["error"] = str(exc)
        return state
    state["is_mountpoint"] = probe.returncode == 0
    return state
def _stat_via_host(host_pid: str, ct_target: str,
timeout: int = _STAT_TIMEOUT) -> dict[str, Any]:
"""Stat the container-internal target through /proc/<pid>/root —
@@ -366,11 +522,37 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
out: list[dict[str, Any]] = []
matched_targets: set[str] = set()
for entry in config_entries:
# Pre-compute per-entry subprocess work in parallel so a CT with
# many mountpoints doesn't pay N×(_STAT_TIMEOUT + _STAT_TIMEOUT)
# serialised cost. The previous serial path tripped Caddy's 3s
# reverse-proxy timeout (Ignacio Seijo 11/05: "/api/lxc/210/
# mount-points → 502 (3.00s)") on hosts with 5+ binds. ThreadPool
# is the right primitive — these are all I/O-bound `df`/`stat`
# calls hitting independent paths.
from concurrent.futures import ThreadPoolExecutor
def _gather_one(entry):
src = entry.get("source", "")
tgt = entry.get("target", "")
classification = _classify(src, pve_storages)
capacity = _capacity_for(
src, classification, pve_storages,
config_options=entry.get("config_options", {}),
host_pid=host_pid if running else "",
target=tgt,
)
host_src = _host_source_state(src)
live_target = bool(running and tgt and tgt in rt_by_target)
health = _stat_via_host(host_pid, tgt) if live_target else None
return entry, classification, capacity, host_src, live_target, health
max_workers = max(2, min(8, len(config_entries) or 1))
with ThreadPoolExecutor(max_workers=max_workers) as pool:
gathered = list(pool.map(_gather_one, config_entries))
for entry, cls, cap, host_src, live_target, health in gathered:
source = entry.get("source", "")
target = entry.get("target", "")
cls = _classify(source, pve_storages)
cap = _capacity_for(source, cls, pve_storages)
item: dict[str, Any] = {
"mp_index": entry.get("mp_index", ""),
@@ -382,13 +564,14 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
"origin_label": cls.get("origin_label", source),
"config_options": entry.get("config_options", {}),
"config_flags": entry.get("config_flags", []),
"host_source_exists": host_src["exists"],
"host_source_is_mountpoint": host_src["is_mountpoint"],
**cap,
}
# Runtime enrichment when CT is up.
if running and target and target in rt_by_target:
if live_target:
rt = rt_by_target[target]
health = _stat_via_host(host_pid, target)
item.update({
"runtime_mounted": True,
"runtime_source": rt["rt_source"],
@@ -416,34 +599,42 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
# original Sprint 13.24 issue revolves around catching them.
ad_hoc: list[dict[str, Any]] = []
if running:
for rt in rt_mounts:
target = rt["rt_target"]
if target in matched_targets:
continue
if not _REMOTE_FS_RE.match(rt["rt_fstype"]):
continue
health = _stat_via_host(host_pid, target)
ad_hoc.append({
"mp_index": "",
"source": rt["rt_source"],
"target": target,
"type": "ad_hoc",
"origin_storage": "",
"origin_storage_type": "",
"origin_label": rt["rt_source"],
"config_options": {},
"config_flags": [],
"total_bytes": None,
"used_bytes": None,
"available_bytes": None,
"runtime_mounted": True,
"runtime_source": rt["rt_source"],
"runtime_fstype": rt["rt_fstype"],
"runtime_options": rt["rt_options"],
"runtime_readonly": rt["rt_readonly"],
"runtime_reachable": health["reachable"],
"runtime_error": health["error"],
})
ad_hoc_candidates = [
rt for rt in rt_mounts
if rt["rt_target"] not in matched_targets
and _REMOTE_FS_RE.match(rt["rt_fstype"])
]
# Same parallelisation as the configured-mp loop: stat'ing
# stale NFS exports serially can dominate the request and
# push it past the proxy timeout.
if ad_hoc_candidates:
with ThreadPoolExecutor(max_workers=max_workers) as pool:
healths = list(pool.map(
lambda rt: _stat_via_host(host_pid, rt["rt_target"]),
ad_hoc_candidates,
))
for rt, health in zip(ad_hoc_candidates, healths):
ad_hoc.append({
"mp_index": "",
"source": rt["rt_source"],
"target": rt["rt_target"],
"type": "ad_hoc",
"origin_storage": "",
"origin_storage_type": "",
"origin_label": rt["rt_source"],
"config_options": {},
"config_flags": [],
"total_bytes": None,
"used_bytes": None,
"available_bytes": None,
"runtime_mounted": True,
"runtime_source": rt["rt_source"],
"runtime_fstype": rt["rt_fstype"],
"runtime_options": rt["rt_options"],
"runtime_readonly": rt["rt_readonly"],
"runtime_reachable": health["reachable"],
"runtime_error": health["error"],
})
return {
"ok": True,
+55 -9
View File
@@ -508,14 +508,22 @@ class EmailChannel(NotificationChannel):
def __init__(self, config: Dict[str, str]):
super().__init__()
self.host = config.get('host', '')
self.host = (config.get('host', '') or '').strip()
self.port = int(config.get('port', 587) or 587)
self.username = config.get('username', '')
self.password = config.get('password', '')
self.tls_mode = config.get('tls_mode', 'starttls') # none | starttls | ssl
self.from_address = config.get('from_address', '')
self.username = config.get('username', '') or ''
self.password = config.get('password', '') or ''
# `dict.get(k, default)` only returns default when the key is MISSING;
# if the user previously saved an empty string or null, we'd end up
# with `tls_mode=''` and silently skip STARTTLS — which causes
# `SMTPNotSupportedError: SMTP AUTH extension not supported by server`
# on Gmail/Outlook because they only advertise AUTH post-STARTTLS.
tls_raw = (config.get('tls_mode') or 'starttls').strip().lower()
if tls_raw not in ('none', 'starttls', 'ssl'):
tls_raw = 'starttls'
self.tls_mode = tls_raw
self.from_address = config.get('from_address', '') or ''
self.to_addresses = self._parse_recipients(config.get('to_addresses', ''))
self.subject_prefix = config.get('subject_prefix', '[ProxMenux]')
self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') or '[ProxMenux]'
self.timeout = int(config.get('timeout', 10) or 10)
@staticmethod
@@ -529,6 +537,17 @@ class EmailChannel(NotificationChannel):
return False, 'No recipients configured'
if not self.from_address:
return False, 'No from address configured'
# Credentials without an explicit SMTP host would silently fall back to
# `/usr/sbin/sendmail`, which ignores username/password entirely — the
# test returns OK because Postfix queued the message, but the relay is
# never authenticated and the mail rots in the local mailq. Reported by
# Ignacio Seijo: "leaving host/port blank, the test passes but the
# mail never arrives".
if (self.username or self.password) and not self.host:
return False, ('SMTP credentials provided but no host configured. '
'Set host (e.g. smtp.gmail.com) and port (587) — '
'without a host the message goes to the local MTA '
'and your username/password are ignored.')
# Must have SMTP host OR local sendmail available
if not self.host:
import os
@@ -591,8 +610,33 @@ class EmailChannel(NotificationChannel):
server.ehlo() # Re-identify after TLS -- server re-announces AUTH
if self.username and self.password:
# If the server doesn't advertise AUTH after our EHLO sequence,
# smtplib's `login()` raises `SMTPNotSupportedError` with the
# opaque message "SMTP AUTH extension not supported by server".
# That fired for users who left tls_mode blank or pointed at
# port 587 without STARTTLS — Gmail only advertises AUTH after
# the TLS handshake. Surface the real reason here.
if not server.has_extn('auth'):
hint = (
f"server={self.host}:{self.port} tls_mode={self.tls_mode}"
)
if self.tls_mode == 'none':
return 0, (
'SMTP server did not advertise AUTH after EHLO. '
'TLS is disabled — most providers (Gmail, Outlook, '
'Office365) only allow login after STARTTLS or SSL. '
f'Switch TLS Mode to STARTTLS (port 587) or SSL/TLS '
f'(port 465). [{hint}]'
)
return 0, (
'SMTP server did not advertise AUTH after EHLO. '
'Verify the host/port/TLS combination. For Gmail use '
'smtp.gmail.com:587 with STARTTLS and an App Password '
'(https://myaccount.google.com/apppasswords); for '
f'Outlook use smtp.office365.com:587 with STARTTLS. [{hint}]'
)
server.login(self.username, self.password)
server.send_message(msg)
server.quit()
server = None
@@ -601,8 +645,10 @@ class EmailChannel(NotificationChannel):
return 0, f'SMTP authentication failed (check username/password or app-specific password): {e}'
except smtplib.SMTPNotSupportedError as e:
return 0, (f'SMTP AUTH not supported by server. '
f'This may mean the server requires OAuth2 or an App Password '
f'instead of regular credentials: {e}')
f'TLS mode: {self.tls_mode}, port: {self.port}. '
f'Gmail/Outlook require STARTTLS on 587 or SSL/TLS on 465. '
f'For Gmail, generate an App Password at '
f'https://myaccount.google.com/apppasswords. Detail: {e}')
except smtplib.SMTPConnectError as e:
return 0, f'SMTP connection failed: {e}'
except smtplib.SMTPException as e:
+69 -1
View File
@@ -292,6 +292,61 @@ def _record_smartd_observation_impl(title: str, message: str):
print(f"[smartd_observation] Error recording smartd observation: {e}")
# ─── Vzdump activity detector (shared, restart-tolerant) ─────────
#
# A single source of truth for "is a vzdump backup job running on this
# host RIGHT NOW", consultable from any watcher and surviving Monitor
# restarts. Reads `/var/log/pve/tasks/active` directly — PVE writes the
# active UPID there at backup start and removes it on completion, so
# it persists across our process restarts.
#
# Without this, JournalWatcher's in-memory `_last_backup_job_ts` got
# reset by every Monitor restart, and any `Starting Backup of VM X`
# log lines arriving after that point were treated as standalone
# backups — emitting one `backup_start` per guest with `storage=local`
# (the fallback path that doesn't see the parent job's --storage flag).
# Reported by JC Miñarro 18/05 after a Monitor redeploy mid-job.
_VZDUMP_ACTIVE_FILE = '/var/log/pve/tasks/active'
# Module-level memo of the last probe (timestamp + result).
_vzdump_active_cache_ts: float = 0
_vzdump_active_cache_value: bool = False
_VZDUMP_ACTIVE_CACHE_TTL = 5  # seconds


def is_vzdump_active_on_host() -> bool:
    """Return True if `/var/log/pve/tasks/active` lists a vzdump UPID
    whose worker process is still alive (i.e. a backup is running).

    The result is cached for `_VZDUMP_ACTIVE_CACHE_TTL` seconds to avoid
    hammering the file on every notification.

    A line counts when it contains ``:vzdump:`` and the third
    colon-separated field (the hex PID of the UPID) names an existing
    process. Liveness is probed with ``os.kill(pid, 0)``, which sends no
    signal.

    Caller-safe: returns False on any I/O / parse error.
    """
    global _vzdump_active_cache_ts, _vzdump_active_cache_value
    now = time.time()
    if now - _vzdump_active_cache_ts < _VZDUMP_ACTIVE_CACHE_TTL:
        return _vzdump_active_cache_value

    found = False
    try:
        # errors='replace': a corrupt byte must not raise
        # UnicodeDecodeError out of a function documented as caller-safe.
        with open(_VZDUMP_ACTIVE_FILE, 'r', errors='replace') as f:
            for line in f:
                # UPID format: UPID:node:pid:pstart:starttime:type:id:user:
                if ':vzdump:' not in line:
                    continue
                parts = line.strip().split(':')
                if len(parts) < 3:
                    continue
                try:
                    pid = int(parts[2], 16)  # PID in UPID is hex
                except ValueError:
                    continue
                # kill(0, ...) and kill(-n, ...) address process groups,
                # not a single process, and would report a bogus "alive".
                if pid <= 0:
                    continue
                try:
                    os.kill(pid, 0)
                except PermissionError:
                    # EPERM: the process exists but we may not signal it.
                    pass
                except (ProcessLookupError, OverflowError):
                    # No such process, or a value that cannot be a PID.
                    continue
                found = True
                break
    except (OSError, IOError):
        pass

    _vzdump_active_cache_ts = now
    _vzdump_active_cache_value = found
    return found
# ─── Journal Watcher (Real-time) ─────────────────────────────────
class JournalWatcher:
@@ -1238,6 +1293,14 @@ class JournalWatcher:
now = time.time()
if now - self._last_backup_job_ts < self._BACKUP_JOB_SUPPRESS_WINDOW:
return # Part of an active job -- already notified
# Restart-tolerant fallback: if the in-memory timestamp was
# cleared (Monitor restarted mid-job) but PVE still has an
# active vzdump UPID, this per-guest line is part of that
# job — drop it instead of emitting a wrong "Backup started
# on local" with storage default. Reported by JC Miñarro 18/05
# after a Monitor redeploy during an active PBS backup.
if is_vzdump_active_on_host():
return
fallback_guest = fb.group(1)
else:
return
@@ -1893,10 +1956,15 @@ class TaskWatcher:
# Suppress VM/CT start/stop/shutdown while a vzdump is active.
# These are backup-induced operations (mode=stop), not user actions.
# Exception: if a VM/CT FAILS or has WARNINGS, that IS important.
# We check BOTH our in-memory tracking (`_is_vzdump_active`) AND
# `tasks/active` on disk (`is_vzdump_active_on_host`). The disk
# check survives Monitor restarts mid-backup, which otherwise
# cleared `_vzdump_running_since` and exposed the post-restart
# shutdown notifications to the user (JC Miñarro 18/05).
_BACKUP_NOISE = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart',
'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'}
if event_type in _BACKUP_NOISE and not is_error and not is_warning:
if self._is_vzdump_active():
if self._is_vzdump_active() or is_vzdump_active_on_host():
return
# Suppress VM/CT stop/shutdown during host shutdown/reboot.
+24 -4
View File
@@ -223,14 +223,28 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
else:
total_time = f"{secs}s"
# ── Extract the storage target name (PBS, PBS-Cloud, local, …) ──
# PVE logs the full command on the first line:
# "INFO: starting new backup job: vzdump 104 105 --storage PBS-Cloud --mode stop"
# We surface it so the notification body can say "PBS-Cloud: vm/104/…"
# instead of the generic "PBS:" prefix when multiple PBS endpoints
# are configured. Reported by JC Miñarro 18/05.
storage_name = ''
for line in lines:
m_storage = re.search(r'--storage\s+(\S+)', line)
if m_storage:
storage_name = m_storage.group(1).strip()
break
if not vms and not total_size:
return None
return {
'vms': vms,
'total_time': total_time,
'total_size': total_size,
'vm_count': len(vms),
'storage_name': storage_name,
}
@@ -277,13 +291,19 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
if detail_line:
parts.append(' | '.join(detail_line))
# PBS/File on separate line with icon
# PBS/File on separate line with icon. When we know the
# storage name (e.g. "PBS-Cloud", "PBS-Office") prefix it so
# the user can tell which destination this archive lives in —
# critical when there are multiple PBS endpoints configured.
if vm.get('filename'):
fname = vm['filename']
storage_name = parsed.get('storage_name', '') or ''
if re.match(r'^(?:ct|vm)/\d+/', fname):
parts.append(f"\U0001F5C4\uFE0F PBS: {fname}")
label = storage_name if storage_name else 'PBS'
parts.append(f"\U0001F5C4\uFE0F {label}: {fname}")
else:
parts.append(f"\U0001F4C1 File: {fname}")
label = storage_name if storage_name else 'File'
parts.append(f"\U0001F4C1 {label}: {fname}")
# Error reason if failed
if status != 'ok' and vm.get('error'):