Update AppImage 1.2.1.2

This commit is contained in:
MacRimi
2026-05-21 19:31:47 +02:00
parent 0651f57e86
commit 3e9dd599a6
5 changed files with 99 additions and 35 deletions
Binary file not shown.
+1 -1
View File
@@ -1 +1 @@
91da610577f6c7254db6941685d901afb0a5de228f1fcac02c4b6e2e72a63683 ProxMenux-1.2.1.2-beta.AppImage
1b72c977163192fba07cb6e18e8539d37c90e9624ff22e3ca2cc3c8a55ce8a8e ProxMenux-1.2.1.2-beta.AppImage
+7 -3
View File
@@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]:
a CT is seen. CT reinstalls with a different OS will keep the old
family cached until the user resets the registry — acceptable
trade-off vs paying the probe cost every 24h cycle.
"""
if not _lxc_updates_notification_enabled():
return []
Detection runs unconditionally so the dashboard always reflects
pending updates on running CTs. The `lxc_updates_available`
notification toggle only gates the *delivery* of the notification
(see _check_managed_installs_updates in notification_events.py),
not the detection — that keeps the toggle semantics consistent with
every other update stream (NVIDIA, Coral, post-install).
"""
# Read existing registry so we can preserve cached `_os_family`.
# No lock needed here — we only inspect; the framework holds the
# write lock when it merges back our results in detect_and_register.
+64 -31
View File
@@ -2287,11 +2287,6 @@ class PollingCollector:
# updates. The fingerprint encodes the per-CT state so a stable
# batch doesn't re-notify while a meaningful change does.
self._notified_lxc_batch: str | None = None
# Track previous state of the LXC-updates notification toggle
# so a user enabling it post-startup bypasses the 24h gate
# ONCE — the next polling cycle runs a fresh detection without
# waiting up to a day. Cleared after the forced run completes.
self._lxc_was_enabled: bool = False
# Track notified ProxMenux versions to avoid duplicates
self._notified_proxmenux_version: str | None = None
self._notified_proxmenux_beta_version: str | None = None
@@ -2664,16 +2659,30 @@ class PollingCollector:
category = old_meta.get('category', '')
reason = old_meta.get('reason', '')
first_seen = old_meta.get('first_seen', '')
# Skip recovery for INFO/OK - they never triggered an alert
if old_meta.get('severity', '') in ('INFO', 'OK'):
self._last_notified.pop(key, None)
continue
# Skip recovery on first poll (we don't know what was before)
if not self._first_poll_done:
self._last_notified.pop(key, None)
continue
# Skip recovery when the persisted snapshot lost the context
# for this error (reason / category both empty). Emitting a
# blank "Resuelto -" message with "Condition resolved" body
# adds no value — the user can't tell which error went away.
# Happens after a long Monitor downtime when the snapshot was
# serialized between polls without reason/category populated,
# or when a future code path slips a key into _known_errors
# without the full metadata. Drop the tracking entry silently
# so we never re-fire on the same incomplete record.
if not reason and not category:
self._last_notified.pop(key, None)
self._notified_severity.pop(key, None)
continue
# Skip recovery if the error was manually acknowledged (dismissed)
# by the user. Acknowledged != resolved -- the problem may still
@@ -2765,15 +2774,25 @@ class PollingCollector:
original_severity = self._notified_severity.get(
key, old_meta.get('severity', 'WARNING'),
)
# Defensive defaults — the template uses `{category}` and
# `{duration}` in both the title and the body; an empty
# `category` produced the cosmetic "Resolved - " with a
# trailing dash (and "The issue has been resolved" with
# a double space) on 2026-05-21. The Fix A filter above
# already drops the worst case (reason AND category empty);
# this layer fills the remaining edge cases when only one
# of the two is missing.
category_label = category or 'health'
duration_label = duration or 'unknown'
data = {
'hostname': self._hostname,
'category': category,
'category': category_label,
'reason': clean_reason,
'error_key': key,
'severity': 'OK',
'original_severity': original_severity,
'first_seen': first_seen,
'duration': duration,
'duration': duration_label,
'is_recovery': True,
}
@@ -3243,23 +3262,7 @@ class PollingCollector:
"""
now = time.time()
# Detect OFF→ON transition of the LXC update toggle. Without
# this, the first polling cycle after service start always sets
# the 24h gate — so a user who enables the toggle later (which
# is the normal flow, since the toggle defaults to OFF) would
# have to wait up to 24h or restart the service before the
# detector ran. A one-shot bypass on the transition fixes that
# without weakening the 24h cadence in steady state.
try:
import managed_installs as _mi
lxc_enabled_now = _mi._lxc_updates_notification_enabled()
except Exception:
lxc_enabled_now = False
lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
self._lxc_was_enabled = lxc_enabled_now
if (not lxc_just_enabled
and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
return
self._last_managed_check = now
@@ -3312,9 +3315,21 @@ class PollingCollector:
# updates. The batch fingerprint is recomputed every cycle and
# compared with the last notified one — if the set of CTs or
# their per-CT fingerprints changed, we notify again.
if lxc_updates:
#
# Detection itself runs unconditionally so the dashboard always
# shows pending updates; the `lxc_updates_available` toggle only
# controls whether a notification is *emitted*. If it's off we
# skip the emit (and the dedup stamp) so re-enabling the toggle
# later fires the next pending batch immediately.
try:
import managed_installs as _mi
lxc_notif_enabled = _mi._lxc_updates_notification_enabled()
except Exception:
lxc_notif_enabled = False
if lxc_updates and lxc_notif_enabled:
self._emit_lxc_updates_batch(lxc_updates)
else:
elif not lxc_updates:
# Empty batch — clear the dedup so a fresh batch later fires
# a new notification even with the same CTs/versions.
self._notified_lxc_batch = None
@@ -3579,7 +3594,27 @@ class PollingCollector:
print(f"[PollingCollector] Failed to save known_errors meta: {e}")
def _load_last_notified(self):
"""Load per-error notification timestamps from DB on startup."""
"""Load per-error notification timestamps from DB on startup.
Reads only the per-key cooldown timestamps so the same-key
24h gate survives a restart. **Does NOT touch `_known_errors`**
— that snapshot is rebuilt exclusively by `_load_known_errors_meta`
(which carries the full reason / category / severity payload
needed to emit a meaningful recovery later).
The original implementation also injected synthetic rows into
`_known_errors` from this table, with `{'error_key': ek,
'first_seen': <epoch int>}` and nothing else. That made the
startup path believe the host had a populated baseline of
active errors, so the first post-restart poll computed
`resolved_keys = synthetic_dummies - current_keys` and emitted
recovery notifications with empty `reason` / `category` /
`severity` fields — the "Resuelto -" / "Condición
resuelta" ghosts the user saw on 2026-05-21. Stale rows
in this table never expire on their own (the bug was eternal:
every restart re-triggered the ghost), so the fix is to never
treat this table as a source of `_known_errors` content.
"""
try:
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
if not db_path.exists():
@@ -3594,8 +3629,6 @@ class PollingCollector:
for fp, ts in cursor.fetchall():
error_key = fp.replace('health_', '', 1)
self._last_notified[error_key] = ts
# _known_errors is a dict (not a set), store minimal metadata
self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts}
conn.close()
except Exception as e:
print(f"[PollingCollector] Failed to load last_notified: {e}")
+27
View File
@@ -484,6 +484,24 @@ AGGREGATION_RULES = {
# burst, avoiding notification floods from any source.
_DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}
# Event types the burst aggregator must never group. The default
# catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as
# group-able, which is the right default for *negative* signals
# (failures, errors, intrusion attempts) but produces noise when
# applied to positive / informational events the user wants to see
# individually.
#
# Concrete failure mode that motivated this list: on 2026-05-21 a
# post-restart resolved-detection batch emitted two `error_resolved`
# events for two stale keys at the same time. The aggregator paired
# them and the user received a useless "+1 error_resolved en 0s
# (2 en total) — Eventos adicionales: Condición resuelta" burst on
# top of the original recovery message. The signal value of a
# recovery is per-event; collapsing them adds zero information.
_AGGREGATION_EXEMPT_EVENTS = frozenset({
'error_resolved',
})
class BurstAggregator:
"""Accumulates similar events in a time window, then sends a single summary.
@@ -517,7 +535,16 @@ class BurstAggregator:
ALL event types are aggregated: specific rules from AGGREGATION_RULES
take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
This prevents notification floods from any source.
Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS`
bypass aggregation entirely and are returned to the dispatcher
as-is. Intended for positive/informational events where
collapsing into a burst summary destroys signal value
(currently only `error_resolved` recoveries are listed).
"""
if event.event_type in _AGGREGATION_EXEMPT_EVENTS:
return event
rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"