diff --git a/AppImage/ProxMenux-1.2.1.2-beta.AppImage b/AppImage/ProxMenux-1.2.1.2-beta.AppImage index f846c098..2fbb7ad0 100755 Binary files a/AppImage/ProxMenux-1.2.1.2-beta.AppImage and b/AppImage/ProxMenux-1.2.1.2-beta.AppImage differ diff --git a/AppImage/ProxMenux-Monitor.AppImage.sha256 b/AppImage/ProxMenux-Monitor.AppImage.sha256 index 42aff57b..00c9bd75 100644 --- a/AppImage/ProxMenux-Monitor.AppImage.sha256 +++ b/AppImage/ProxMenux-Monitor.AppImage.sha256 @@ -1 +1 @@ -91da610577f6c7254db6941685d901afb0a5de228f1fcac02c4b6e2e72a63683 ProxMenux-1.2.1.2-beta.AppImage +1b72c977163192fba07cb6e18e8539d37c90e9624ff22e3ca2cc3c8a55ce8a8e ProxMenux-1.2.1.2-beta.AppImage diff --git a/AppImage/scripts/managed_installs.py b/AppImage/scripts/managed_installs.py index a88173d7..bfaef854 100644 --- a/AppImage/scripts/managed_installs.py +++ b/AppImage/scripts/managed_installs.py @@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]: a CT is seen. CT reinstalls with a different OS will keep the old family cached until the user resets the registry — acceptable trade-off vs paying the probe cost every 24h cycle. - """ - if not _lxc_updates_notification_enabled(): - return [] + Detection runs unconditionally so the dashboard always reflects + pending updates on running CTs. The `lxc_updates_available` + notification toggle only gates the *delivery* of the notification + (see _check_managed_installs_updates in notification_events.py), + not the detection — that keeps the toggle semantics consistent with + every other update stream (NVIDIA, Coral, post-install). + """ # Read existing registry so we can preserve cached `_os_family`. # No lock needed here — we only inspect; the framework holds the # write lock when it merges back our results in detect_and_register. 
diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 7befcce8..5c17acce 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -2287,11 +2287,6 @@ class PollingCollector: # updates. The fingerprint encodes the per-CT state so a stable # batch doesn't re-notify while a meaningful change does. self._notified_lxc_batch: str | None = None - # Track previous state of the LXC-updates notification toggle - # so a user enabling it post-startup bypasses the 24h gate - # ONCE — the next polling cycle runs a fresh detection without - # waiting up to a day. Cleared after the forced run completes. - self._lxc_was_enabled: bool = False # Track notified ProxMenux versions to avoid duplicates self._notified_proxmenux_version: str | None = None self._notified_proxmenux_beta_version: str | None = None @@ -2664,16 +2659,30 @@ class PollingCollector: category = old_meta.get('category', '') reason = old_meta.get('reason', '') first_seen = old_meta.get('first_seen', '') - + # Skip recovery for INFO/OK - they never triggered an alert if old_meta.get('severity', '') in ('INFO', 'OK'): self._last_notified.pop(key, None) continue - + # Skip recovery on first poll (we don't know what was before) if not self._first_poll_done: self._last_notified.pop(key, None) continue + + # Skip recovery when the persisted snapshot lost the context + # for this error (reason / category both empty). Emitting a + # blank "Resuelto -" message with "Condition resolved" body + # adds no value — the user can't tell which error went away. + # Happens after a long Monitor downtime when the snapshot was + # serialized between polls without reason/category populated, + # or when a future code path slips a key into _known_errors + # without the full metadata. Drop the tracking entry silently + # so we never re-fire on the same incomplete record. 
+ if not reason and not category: + self._last_notified.pop(key, None) + self._notified_severity.pop(key, None) + continue # Skip recovery if the error was manually acknowledged (dismissed) # by the user. Acknowledged != resolved -- the problem may still @@ -2765,15 +2774,25 @@ class PollingCollector: original_severity = self._notified_severity.get( key, old_meta.get('severity', 'WARNING'), ) + # Defensive defaults — the template uses `{category}` and + # `{duration}` in both the title and the body; an empty + # `category` produced the cosmetic "Resolved - " with a + # trailing dash (and "The issue has been resolved" with + # a double space) on 2026-05-21. The Fix A filter above + # already drops the worst case (reason AND category empty); + # this layer fills the remaining edge cases when only one + # of the two is missing. + category_label = category or 'health' + duration_label = duration or 'unknown' data = { 'hostname': self._hostname, - 'category': category, + 'category': category_label, 'reason': clean_reason, 'error_key': key, 'severity': 'OK', 'original_severity': original_severity, 'first_seen': first_seen, - 'duration': duration, + 'duration': duration_label, 'is_recovery': True, } @@ -3243,23 +3262,7 @@ class PollingCollector: """ now = time.time() - # Detect OFF→ON transition of the LXC update toggle. Without - # this, the first polling cycle after service start always sets - # the 24h gate — so a user who enables the toggle later (which - # is the normal flow, since the toggle defaults to OFF) would - # have to wait up to 24h or restart the service before the - # detector ran. A one-shot bypass on the transition fixes that - # without weakening the 24h cadence in steady state. 
- try: - import managed_installs as _mi - lxc_enabled_now = _mi._lxc_updates_notification_enabled() - except Exception: - lxc_enabled_now = False - lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled - self._lxc_was_enabled = lxc_enabled_now - - if (not lxc_just_enabled - and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL): + if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL: return self._last_managed_check = now @@ -3312,9 +3315,21 @@ class PollingCollector: # updates. The batch fingerprint is recomputed every cycle and # compared with the last notified one — if the set of CTs or # their per-CT fingerprints changed, we notify again. - if lxc_updates: + # + # Detection itself runs unconditionally so the dashboard always + # shows pending updates; the `lxc_updates_available` toggle only + # controls whether a notification is *emitted*. If it's off we + # skip the emit (and the dedup stamp) so re-enabling the toggle + # later fires the next pending batch immediately. + try: + import managed_installs as _mi + lxc_notif_enabled = _mi._lxc_updates_notification_enabled() + except Exception: + lxc_notif_enabled = False + + if lxc_updates and lxc_notif_enabled: self._emit_lxc_updates_batch(lxc_updates) - else: + elif not lxc_updates: # Empty batch — clear the dedup so a fresh batch later fires # a new notification even with the same CTs/versions. self._notified_lxc_batch = None @@ -3579,7 +3594,27 @@ class PollingCollector: print(f"[PollingCollector] Failed to save known_errors meta: {e}") def _load_last_notified(self): - """Load per-error notification timestamps from DB on startup.""" + """Load per-error notification timestamps from DB on startup. + + Reads only the per-key cooldown timestamps so the same-key + 24h gate survives a restart. 
**Does NOT touch `_known_errors`** + — that snapshot is rebuilt exclusively by `_load_known_errors_meta` + (which carries the full reason / category / severity payload + needed to emit a meaningful recovery later). + + The original implementation also injected synthetic rows into + `_known_errors` from this table, with `{'error_key': ek, + 'first_seen': <ts>}` and nothing else. That made the + startup path believe the host had a populated baseline of + active errors, so the first post-restart poll computed + `resolved_keys = synthetic_dummies − current_keys` and emitted + recovery notifications with empty `reason` / `category` / + `severity` fields — the “Resuelto -” / “Condición + resuelta” ghosts the user saw on 2026-05-21. Stale rows + in this table never expire on their own (the bug was eternal: + every restart re-triggered the ghost), so the fix is to never + treat this table as a source of `_known_errors` content. + """ try: db_path = Path('/usr/local/share/proxmenux/health_monitor.db') if not db_path.exists(): @@ -3594,8 +3629,6 @@ class PollingCollector: for fp, ts in cursor.fetchall(): error_key = fp.replace('health_', '', 1) self._last_notified[error_key] = ts - # _known_errors is a dict (not a set), store minimal metadata - self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts} conn.close() except Exception as e: print(f"[PollingCollector] Failed to load last_notified: {e}") diff --git a/AppImage/scripts/notification_manager.py b/AppImage/scripts/notification_manager.py index 7839922d..57267dbc 100644 --- a/AppImage/scripts/notification_manager.py +++ b/AppImage/scripts/notification_manager.py @@ -484,6 +484,24 @@ AGGREGATION_RULES = { # burst, avoiding notification floods from any source. _DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'} +# Event types the burst aggregator must never group. 
The default +# catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as +# group-able, which is the right default for *negative* signals +# (failures, errors, intrusion attempts) but produces noise when +# applied to positive / informational events the user wants to see +# individually. +# +# Concrete failure mode that motivated this list: on 2026-05-21 a +# post-restart resolved-detection batch emitted two `error_resolved` +# events for two stale keys at the same time. The aggregator paired +# them and the user received a useless "+1 error_resolved en 0s +# (2 en total) — Eventos adicionales: Condición resuelta" burst on +# top of the original recovery message. The signal value of a +# recovery is per-event; collapsing them adds zero information. +_AGGREGATION_EXEMPT_EVENTS = frozenset({ + 'error_resolved', +}) + class BurstAggregator: """Accumulates similar events in a time window, then sends a single summary. @@ -517,7 +535,16 @@ class BurstAggregator: ALL event types are aggregated: specific rules from AGGREGATION_RULES take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies. This prevents notification floods from any source. + + Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS` + bypass aggregation entirely and are returned to the dispatcher + as-is. Intended for positive/informational events where + collapsing into a burst summary destroys signal value; + currently only `error_resolved` (recoveries) is listed. """ + if event.event_type in _AGGREGATION_EXEMPT_EVENTS: + return event + rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION) bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"