Update AppImage 1.2.1.2

2026-05-22 16:44:48 +00:00 · 2026-05-21 19:31:47 +02:00
parent 0651f57e86
commit 3e9dd599a6
5 changed files with 99 additions and 35 deletions
@@ -1 +1 @@
-91da610577f6c7254db6941685d901afb0a5de228f1fcac02c4b6e2e72a63683  ProxMenux-1.2.1.2-beta.AppImage
+1b72c977163192fba07cb6e18e8539d37c90e9624ff22e3ca2cc3c8a55ce8a8e  ProxMenux-1.2.1.2-beta.AppImage
@@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]:
    a CT is seen. CT reinstalls with a different OS will keep the old
    family cached until the user resets the registry — acceptable
    trade-off vs paying the probe cost every 24h cycle.
    """
    if not _lxc_updates_notification_enabled():
        return []
    Detection runs unconditionally so the dashboard always reflects
    pending updates on running CTs. The `lxc_updates_available`
    notification toggle only gates the *delivery* of the notification
    (see _check_managed_installs_updates in notification_events.py),
    not the detection — that keeps the toggle semantics consistent with
    every other update stream (NVIDIA, Coral, post-install).
    """
    # Read existing registry so we can preserve cached `_os_family`.
    # No lock needed here — we only inspect; the framework holds the
    # write lock when it merges back our results in detect_and_register.
@@ -2287,11 +2287,6 @@ class PollingCollector:
        # updates. The fingerprint encodes the per-CT state so a stable
        # batch doesn't re-notify while a meaningful change does.
        self._notified_lxc_batch: str | None = None
        # Track previous state of the LXC-updates notification toggle
        # so a user enabling it post-startup bypasses the 24h gate
        # ONCE — the next polling cycle runs a fresh detection without
        # waiting up to a day. Cleared after the forced run completes.
        self._lxc_was_enabled: bool = False
        # Track notified ProxMenux versions to avoid duplicates
        self._notified_proxmenux_version: str | None = None
        self._notified_proxmenux_beta_version: str | None = None
@@ -2675,6 +2670,20 @@ class PollingCollector:
                self._last_notified.pop(key, None)
                continue
            # Skip recovery when the persisted snapshot lost the context
            # for this error (reason / category both empty). Emitting a
            # blank "Resuelto -" message with "Condition resolved" body
            # adds no value — the user can't tell which error went away.
            # Happens after a long Monitor downtime when the snapshot was
            # serialized between polls without reason/category populated,
            # or when a future code path slips a key into _known_errors
            # without the full metadata. Drop the tracking entry silently
            # so we never re-fire on the same incomplete record.
            if not reason and not category:
                self._last_notified.pop(key, None)
                self._notified_severity.pop(key, None)
                continue
            # Skip recovery if the error was manually acknowledged (dismissed)
            # by the user. Acknowledged != resolved -- the problem may still
            # exist, the user just chose to suppress notifications for it.
@@ -2765,15 +2774,25 @@ class PollingCollector:
            original_severity = self._notified_severity.get(
                key, old_meta.get('severity', 'WARNING'),
            )
            # Defensive defaults — the template uses `{category}` and
            # `{duration}` in both the title and the body; an empty
            # `category` produced the cosmetic "Resolved - " with a
            # trailing dash (and "The  issue has been resolved" with
            # a double space) on 2026-05-21. The Fix A filter above
            # already drops the worst case (reason AND category empty);
            # this layer fills the remaining edge cases when only one
            # of the two is missing.
            category_label = category or 'health'
            duration_label = duration or 'unknown'
            data = {
                'hostname': self._hostname,
-                'category': category,
+                'category': category_label,
                'reason': clean_reason,
                'error_key': key,
                'severity': 'OK',
                'original_severity': original_severity,
                'first_seen': first_seen,
-                'duration': duration,
+                'duration': duration_label,
                'is_recovery': True,
            }
@@ -3243,23 +3262,7 @@ class PollingCollector:
        """
        now = time.time()
-        # Detect OFF→ON transition of the LXC update toggle. Without
+        if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
        # this, the first polling cycle after service start always sets
        # the 24h gate — so a user who enables the toggle later (which
        # is the normal flow, since the toggle defaults to OFF) would
        # have to wait up to 24h or restart the service before the
        # detector ran. A one-shot bypass on the transition fixes that
        # without weakening the 24h cadence in steady state.
        try:
            import managed_installs as _mi
            lxc_enabled_now = _mi._lxc_updates_notification_enabled()
        except Exception:
            lxc_enabled_now = False
        lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
        self._lxc_was_enabled = lxc_enabled_now
        if (not lxc_just_enabled
                and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
            return
        self._last_managed_check = now
@@ -3312,9 +3315,21 @@ class PollingCollector:
        # updates. The batch fingerprint is recomputed every cycle and
        # compared with the last notified one — if the set of CTs or
        # their per-CT fingerprints changed, we notify again.
-        if lxc_updates:
+        #
        # Detection itself runs unconditionally so the dashboard always
        # shows pending updates; the `lxc_updates_available` toggle only
        # controls whether a notification is *emitted*. If it's off we
        # skip the emit (and the dedup stamp) so re-enabling the toggle
        # later fires the next pending batch immediately.
        try:
            import managed_installs as _mi
            lxc_notif_enabled = _mi._lxc_updates_notification_enabled()
        except Exception:
            lxc_notif_enabled = False
        if lxc_updates and lxc_notif_enabled:
            self._emit_lxc_updates_batch(lxc_updates)
-        else:
+        elif not lxc_updates:
            # Empty batch — clear the dedup so a fresh batch later fires
            # a new notification even with the same CTs/versions.
            self._notified_lxc_batch = None
@@ -3579,7 +3594,27 @@ class PollingCollector:
            print(f"[PollingCollector] Failed to save known_errors meta: {e}")
    def _load_last_notified(self):
-        """Load per-error notification timestamps from DB on startup."""
+        """Load per-error notification timestamps from DB on startup.
        Reads only the per-key cooldown timestamps so the same-key
        24h gate survives a restart. **Does NOT touch `_known_errors`**
        — that snapshot is rebuilt exclusively by `_load_known_errors_meta`
        (which carries the full reason / category / severity payload
        needed to emit a meaningful recovery later).
        The original implementation also injected synthetic rows into
        `_known_errors` from this table, with `{'error_key': ek,
        'first_seen': <epoch int>}` and nothing else. That made the
        startup path believe the host had a populated baseline of
        active errors, so the first post-restart poll computed
        `resolved_keys = synthetic_dummies − current_keys` and emitted
        recovery notifications with empty `reason` / `category` /
        `severity` fields — the “Resuelto -” / “Condición
        resuelta” ghosts the user saw on 2026-05-21. Stale rows
        in this table never expire on their own (the bug was eternal:
        every restart re-triggered the ghost), so the fix is to never
        treat this table as a source of `_known_errors` content.
        """
        try:
            db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
            if not db_path.exists():
@@ -3594,8 +3629,6 @@ class PollingCollector:
            for fp, ts in cursor.fetchall():
                error_key = fp.replace('health_', '', 1)
                self._last_notified[error_key] = ts
                # _known_errors is a dict (not a set), store minimal metadata
                self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts}
            conn.close()
        except Exception as e:
            print(f"[PollingCollector] Failed to load last_notified: {e}")
@@ -484,6 +484,24 @@ AGGREGATION_RULES = {
 # burst, avoiding notification floods from any source.
 _DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}
 # Event types the burst aggregator must never group. The default
 # catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as
 # group-able, which is the right default for *negative* signals
 # (failures, errors, intrusion attempts) but produces noise when
 # applied to positive / informational events the user wants to see
 # individually.
 #
 # Concrete failure mode that motivated this list: on 2026-05-21 a
 # post-restart resolved-detection batch emitted two `error_resolved`
 # events for two stale keys at the same time. The aggregator paired
 # them and the user received a useless "+1 error_resolved en 0s
 # (2 en total) — Eventos adicionales: Condición resuelta" burst on
 # top of the original recovery message. The signal value of a
 # recovery is per-event; collapsing them adds zero information.
 _AGGREGATION_EXEMPT_EVENTS = frozenset({
    'error_resolved',
 })
 class BurstAggregator:
    """Accumulates similar events in a time window, then sends a single summary.
@@ -517,7 +535,16 @@ class BurstAggregator:
        ALL event types are aggregated: specific rules from AGGREGATION_RULES
        take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
        This prevents notification floods from any source.
        Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS`
        bypass aggregation entirely and are returned to the dispatcher
        as-is. Intended for positive/informational events (recoveries,
        scheduled-task completions) where collapsing into a burst
        summary destroys signal value; currently only `error_resolved`
        is listed.
        """
        if event.event_type in _AGGREGATION_EXEMPT_EVENTS:
            return event
        rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
        bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"
`@@ -1 +1 @@`
	`91da610577f6c7254db6941685d901afb0a5de228f1fcac02c4b6e2e72a63683 ProxMenux-1.2.1.2-beta.AppImage`	`1b72c977163192fba07cb6e18e8539d37c90e9624ff22e3ca2cc3c8a55ce8a8e ProxMenux-1.2.1.2-beta.AppImage`