Update AppImage 1.2.1.2

2026-05-22 16:44:48 +00:00 · 2026-05-21 19:31:47 +02:00
parent 0651f57e86
commit 3e9dd599a6
5 changed files with 99 additions and 35 deletions
@@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]:
    a CT is seen. CT reinstalls with a different OS will keep the old
    family cached until the user resets the registry — acceptable
    trade-off vs paying the probe cost every 24h cycle.
-    """
-    if not _lxc_updates_notification_enabled():
-        return []

+    Detection runs unconditionally so the dashboard always reflects
+    pending updates on running CTs. The `lxc_updates_available`
+    notification toggle only gates the *delivery* of the notification
+    (see _check_managed_installs_updates in notification_events.py),
+    not the detection — that keeps the toggle semantics consistent with
+    every other update stream (NVIDIA, Coral, post-install).
+    """
    # Read existing registry so we can preserve cached `_os_family`.
    # No lock needed here — we only inspect; the framework holds the
    # write lock when it merges back our results in detect_and_register.
@@ -2287,11 +2287,6 @@ class PollingCollector:
        # updates. The fingerprint encodes the per-CT state so a stable
        # batch doesn't re-notify while a meaningful change does.
        self._notified_lxc_batch: str | None = None
-        # Track previous state of the LXC-updates notification toggle
-        # so a user enabling it post-startup bypasses the 24h gate
-        # ONCE — the next polling cycle runs a fresh detection without
-        # waiting up to a day. Cleared after the forced run completes.
-        self._lxc_was_enabled: bool = False
        # Track notified ProxMenux versions to avoid duplicates
        self._notified_proxmenux_version: str | None = None
        self._notified_proxmenux_beta_version: str | None = None
@@ -2664,16 +2659,30 @@ class PollingCollector:
            category = old_meta.get('category', '')
            reason = old_meta.get('reason', '')
            first_seen = old_meta.get('first_seen', '')
-            
+
            # Skip recovery for INFO/OK - they never triggered an alert
            if old_meta.get('severity', '') in ('INFO', 'OK'):
                self._last_notified.pop(key, None)
                continue
-            
+
            # Skip recovery on first poll (we don't know what was before)
            if not self._first_poll_done:
                self._last_notified.pop(key, None)
                continue
+
+            # Skip recovery when the persisted snapshot lost the context
+            # for this error (reason / category both empty). Emitting a
+            # blank "Resuelto -" message with "Condition resolved" body
+            # adds no value — the user can't tell which error went away.
+            # Happens after a long Monitor downtime when the snapshot was
+            # serialized between polls without reason/category populated,
+            # or when a future code path slips a key into _known_errors
+            # without the full metadata. Drop the tracking entry silently
+            # so we never re-fire on the same incomplete record.
+            if not reason and not category:
+                self._last_notified.pop(key, None)
+                self._notified_severity.pop(key, None)
+                continue
            
            # Skip recovery if the error was manually acknowledged (dismissed)
            # by the user. Acknowledged != resolved -- the problem may still
@@ -2765,15 +2774,25 @@ class PollingCollector:
            original_severity = self._notified_severity.get(
                key, old_meta.get('severity', 'WARNING'),
            )
+            # Defensive defaults — the template uses `{category}` and
+            # `{duration}` in both the title and the body; an empty
+            # `category` produced the cosmetic "Resolved - " with a
+            # trailing dash (and "The  issue has been resolved" with
+            # a double space) on 2026-05-21. The empty-context skip above
+            # already drops the worst case (reason AND category empty);
+            # this layer fills the remaining edge cases when only one
+            # of the two is missing.
+            category_label = category or 'health'
+            duration_label = duration or 'unknown'
            data = {
                'hostname': self._hostname,
-                'category': category,
+                'category': category_label,
                'reason': clean_reason,
                'error_key': key,
                'severity': 'OK',
                'original_severity': original_severity,
                'first_seen': first_seen,
-                'duration': duration,
+                'duration': duration_label,
                'is_recovery': True,
            }

@@ -3243,23 +3262,7 @@ class PollingCollector:
        """
        now = time.time()

-        # Detect OFF→ON transition of the LXC update toggle. Without
-        # this, the first polling cycle after service start always sets
-        # the 24h gate — so a user who enables the toggle later (which
-        # is the normal flow, since the toggle defaults to OFF) would
-        # have to wait up to 24h or restart the service before the
-        # detector ran. A one-shot bypass on the transition fixes that
-        # without weakening the 24h cadence in steady state.
-        try:
-            import managed_installs as _mi
-            lxc_enabled_now = _mi._lxc_updates_notification_enabled()
-        except Exception:
-            lxc_enabled_now = False
-        lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
-        self._lxc_was_enabled = lxc_enabled_now
-
-        if (not lxc_just_enabled
-                and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
+        if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
            return
        self._last_managed_check = now

@@ -3312,9 +3315,21 @@ class PollingCollector:
        # updates. The batch fingerprint is recomputed every cycle and
        # compared with the last notified one — if the set of CTs or
        # their per-CT fingerprints changed, we notify again.
-        if lxc_updates:
+        #
+        # Detection itself runs unconditionally so the dashboard always
+        # shows pending updates; the `lxc_updates_available` toggle only
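+        # controls whether a notification is *emitted*. If it's off we
+        # skip the emit (and the dedup stamp) so that a batch first seen
+        # while the toggle was off is still notified on the first check
+        # cycle after the toggle is re-enabled, rather than being treated
+        # as already sent.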
+        try:
+            import managed_installs as _mi
+            lxc_notif_enabled = _mi._lxc_updates_notification_enabled()
+        except Exception:
+            lxc_notif_enabled = False
+
+        if lxc_updates and lxc_notif_enabled:
            self._emit_lxc_updates_batch(lxc_updates)
-        else:
+        elif not lxc_updates:
            # Empty batch — clear the dedup so a fresh batch later fires
            # a new notification even with the same CTs/versions.
            self._notified_lxc_batch = None
@@ -3579,7 +3594,27 @@ class PollingCollector:
            print(f"[PollingCollector] Failed to save known_errors meta: {e}")

    def _load_last_notified(self):
-        """Load per-error notification timestamps from DB on startup."""
+        """Load per-error notification timestamps from DB on startup.
+
+        Reads only the per-key cooldown timestamps so the same-key
+        24h gate survives a restart. **Does NOT touch `_known_errors`**
+        — that snapshot is rebuilt exclusively by `_load_known_errors_meta`
+        (which carries the full reason / category / severity payload
+        needed to emit a meaningful recovery later).
+
+        The original implementation also injected synthetic rows into
+        `_known_errors` from this table, with `{'error_key': ek,
+        'first_seen': <epoch int>}` and nothing else. That made the
+        startup path believe the host had a populated baseline of
+        active errors, so the first post-restart poll computed
+        `resolved_keys = synthetic_dummies − current_keys` and emitted
+        recovery notifications with empty `reason` / `category` /
+        `severity` fields — the "Resuelto -" / "Condición
+        resuelta" ghosts the user saw on 2026-05-21. Stale rows
+        in this table never expire on their own (the bug was eternal:
+        every restart re-triggered the ghost), so the fix is to never
+        treat this table as a source of `_known_errors` content.
+        """
        try:
            db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
            if not db_path.exists():
@@ -3594,8 +3629,6 @@ class PollingCollector:
            for fp, ts in cursor.fetchall():
                error_key = fp.replace('health_', '', 1)
                self._last_notified[error_key] = ts
-                # _known_errors is a dict (not a set), store minimal metadata
-                self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts}
            conn.close()
        except Exception as e:
            print(f"[PollingCollector] Failed to load last_notified: {e}")
@@ -484,6 +484,24 @@ AGGREGATION_RULES = {
 # burst, avoiding notification floods from any source.
 _DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}

+# Event types the burst aggregator must never group. The default
+# catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as
+# group-able, which is the right default for *negative* signals
+# (failures, errors, intrusion attempts) but produces noise when
+# applied to positive / informational events the user wants to see
+# individually.
+#
+# Concrete failure mode that motivated this list: on 2026-05-21 a
+# post-restart resolved-detection batch emitted two `error_resolved`
+# events for two stale keys at the same time. The aggregator paired
+# them and the user received a useless "+1 error_resolved en 0s
+# (2 en total) — Eventos adicionales: Condición resuelta" burst on
+# top of the original recovery message. The signal value of a
+# recovery is per-event; collapsing them adds zero information.
+_AGGREGATION_EXEMPT_EVENTS = frozenset({
+    'error_resolved',
+})
+

 class BurstAggregator:
    """Accumulates similar events in a time window, then sends a single summary.
@@ -517,7 +535,16 @@ class BurstAggregator:
        ALL event types are aggregated: specific rules from AGGREGATION_RULES
        take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
        This prevents notification floods from any source.
+
+        Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS`
+        bypass aggregation entirely and are returned to the dispatcher
+        as-is. Intended for positive/informational events where
+        collapsing into a burst summary destroys signal value;
+        currently the only exempt type is `error_resolved` (recoveries).
        """
+        if event.event_type in _AGGREGATION_EXEMPT_EVENTS:
+            return event
+
        rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)

        bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"