mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-22 16:44:48 +00:00
Update AppImage 1..2.1.2
This commit is contained in:
@@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]:
|
||||
a CT is seen. CT reinstalls with a different OS will keep the old
|
||||
family cached until the user resets the registry — acceptable
|
||||
trade-off vs paying the probe cost every 24h cycle.
|
||||
"""
|
||||
if not _lxc_updates_notification_enabled():
|
||||
return []
|
||||
|
||||
Detection runs unconditionally so the dashboard always reflects
|
||||
pending updates on running CTs. The `lxc_updates_available`
|
||||
notification toggle only gates the *delivery* of the notification
|
||||
(see _check_managed_installs_updates in notification_events.py),
|
||||
not the detection — that keeps the toggle semantics consistent with
|
||||
every other update stream (NVIDIA, Coral, post-install).
|
||||
"""
|
||||
# Read existing registry so we can preserve cached `_os_family`.
|
||||
# No lock needed here — we only inspect; the framework holds the
|
||||
# write lock when it merges back our results in detect_and_register.
|
||||
|
||||
@@ -2287,11 +2287,6 @@ class PollingCollector:
|
||||
# updates. The fingerprint encodes the per-CT state so a stable
|
||||
# batch doesn't re-notify while a meaningful change does.
|
||||
self._notified_lxc_batch: str | None = None
|
||||
# Track previous state of the LXC-updates notification toggle
|
||||
# so a user enabling it post-startup bypasses the 24h gate
|
||||
# ONCE — the next polling cycle runs a fresh detection without
|
||||
# waiting up to a day. Cleared after the forced run completes.
|
||||
self._lxc_was_enabled: bool = False
|
||||
# Track notified ProxMenux versions to avoid duplicates
|
||||
self._notified_proxmenux_version: str | None = None
|
||||
self._notified_proxmenux_beta_version: str | None = None
|
||||
@@ -2664,16 +2659,30 @@ class PollingCollector:
|
||||
category = old_meta.get('category', '')
|
||||
reason = old_meta.get('reason', '')
|
||||
first_seen = old_meta.get('first_seen', '')
|
||||
|
||||
|
||||
# Skip recovery for INFO/OK - they never triggered an alert
|
||||
if old_meta.get('severity', '') in ('INFO', 'OK'):
|
||||
self._last_notified.pop(key, None)
|
||||
continue
|
||||
|
||||
|
||||
# Skip recovery on first poll (we don't know what was before)
|
||||
if not self._first_poll_done:
|
||||
self._last_notified.pop(key, None)
|
||||
continue
|
||||
|
||||
# Skip recovery when the persisted snapshot lost the context
|
||||
# for this error (reason / category both empty). Emitting a
|
||||
# blank "Resuelto -" message with "Condition resolved" body
|
||||
# adds no value — the user can't tell which error went away.
|
||||
# Happens after a long Monitor downtime when the snapshot was
|
||||
# serialized between polls without reason/category populated,
|
||||
# or when a future code path slips a key into _known_errors
|
||||
# without the full metadata. Drop the tracking entry silently
|
||||
# so we never re-fire on the same incomplete record.
|
||||
if not reason and not category:
|
||||
self._last_notified.pop(key, None)
|
||||
self._notified_severity.pop(key, None)
|
||||
continue
|
||||
|
||||
# Skip recovery if the error was manually acknowledged (dismissed)
|
||||
# by the user. Acknowledged != resolved -- the problem may still
|
||||
@@ -2765,15 +2774,25 @@ class PollingCollector:
|
||||
original_severity = self._notified_severity.get(
|
||||
key, old_meta.get('severity', 'WARNING'),
|
||||
)
|
||||
# Defensive defaults — the template uses `{category}` and
|
||||
# `{duration}` in both the title and the body; an empty
|
||||
# `category` produced the cosmetic "Resolved - " with a
|
||||
# trailing dash (and "The issue has been resolved" with
|
||||
# a double space) on 2026-05-21. The `not reason and not category` guard above
|
||||
# already drops the worst case (reason AND category empty);
|
||||
# this layer fills the remaining edge cases when only one
|
||||
# of the two is missing.
|
||||
category_label = category or 'health'
|
||||
duration_label = duration or 'unknown'
|
||||
data = {
|
||||
'hostname': self._hostname,
|
||||
'category': category,
|
||||
'category': category_label,
|
||||
'reason': clean_reason,
|
||||
'error_key': key,
|
||||
'severity': 'OK',
|
||||
'original_severity': original_severity,
|
||||
'first_seen': first_seen,
|
||||
'duration': duration,
|
||||
'duration': duration_label,
|
||||
'is_recovery': True,
|
||||
}
|
||||
|
||||
@@ -3243,23 +3262,7 @@ class PollingCollector:
|
||||
"""
|
||||
now = time.time()
|
||||
|
||||
# Detect OFF→ON transition of the LXC update toggle. Without
|
||||
# this, the first polling cycle after service start always sets
|
||||
# the 24h gate — so a user who enables the toggle later (which
|
||||
# is the normal flow, since the toggle defaults to OFF) would
|
||||
# have to wait up to 24h or restart the service before the
|
||||
# detector ran. A one-shot bypass on the transition fixes that
|
||||
# without weakening the 24h cadence in steady state.
|
||||
try:
|
||||
import managed_installs as _mi
|
||||
lxc_enabled_now = _mi._lxc_updates_notification_enabled()
|
||||
except Exception:
|
||||
lxc_enabled_now = False
|
||||
lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
|
||||
self._lxc_was_enabled = lxc_enabled_now
|
||||
|
||||
if (not lxc_just_enabled
|
||||
and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
|
||||
if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
|
||||
return
|
||||
self._last_managed_check = now
|
||||
|
||||
@@ -3312,9 +3315,21 @@ class PollingCollector:
|
||||
# updates. The batch fingerprint is recomputed every cycle and
|
||||
# compared with the last notified one — if the set of CTs or
|
||||
# their per-CT fingerprints changed, we notify again.
|
||||
if lxc_updates:
|
||||
#
|
||||
# Detection itself runs unconditionally so the dashboard always
|
||||
# shows pending updates; the `lxc_updates_available` toggle only
|
||||
# controls whether a notification is *emitted*. If it's off we
|
||||
# skip the emit (and the dedup stamp) so re-enabling the toggle
|
||||
# later fires the next pending batch immediately.
|
||||
try:
|
||||
import managed_installs as _mi
|
||||
lxc_notif_enabled = _mi._lxc_updates_notification_enabled()
|
||||
except Exception:
|
||||
lxc_notif_enabled = False
|
||||
|
||||
if lxc_updates and lxc_notif_enabled:
|
||||
self._emit_lxc_updates_batch(lxc_updates)
|
||||
else:
|
||||
elif not lxc_updates:
|
||||
# Empty batch — clear the dedup so a fresh batch later fires
|
||||
# a new notification even with the same CTs/versions.
|
||||
self._notified_lxc_batch = None
|
||||
@@ -3579,7 +3594,27 @@ class PollingCollector:
|
||||
print(f"[PollingCollector] Failed to save known_errors meta: {e}")
|
||||
|
||||
def _load_last_notified(self):
|
||||
"""Load per-error notification timestamps from DB on startup."""
|
||||
"""Load per-error notification timestamps from DB on startup.
|
||||
|
||||
Reads only the per-key cooldown timestamps so the same-key
|
||||
24h gate survives a restart. **Does NOT touch `_known_errors`**
|
||||
— that snapshot is rebuilt exclusively by `_load_known_errors_meta`
|
||||
(which carries the full reason / category / severity payload
|
||||
needed to emit a meaningful recovery later).
|
||||
|
||||
The original implementation also injected synthetic rows into
|
||||
`_known_errors` from this table, with `{'error_key': ek,
|
||||
'first_seen': <epoch int>}` and nothing else. That made the
|
||||
startup path believe the host had a populated baseline of
|
||||
active errors, so the first post-restart poll computed
|
||||
`resolved_keys = synthetic_dummies − current_keys` and emitted
|
||||
recovery notifications with empty `reason` / `category` /
|
||||
`severity` fields — the “Resuelto -” / “Condición
|
||||
resuelta” ghosts the user saw on 2026-05-21. Stale rows
|
||||
in this table never expire on their own (the bug was eternal:
|
||||
every restart re-triggered the ghost), so the fix is to never
|
||||
treat this table as a source of `_known_errors` content.
|
||||
"""
|
||||
try:
|
||||
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
|
||||
if not db_path.exists():
|
||||
@@ -3594,8 +3629,6 @@ class PollingCollector:
|
||||
for fp, ts in cursor.fetchall():
|
||||
error_key = fp.replace('health_', '', 1)
|
||||
self._last_notified[error_key] = ts
|
||||
# _known_errors is a dict (not a set), store minimal metadata
|
||||
self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts}
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"[PollingCollector] Failed to load last_notified: {e}")
|
||||
|
||||
@@ -484,6 +484,24 @@ AGGREGATION_RULES = {
|
||||
# burst, avoiding notification floods from any source.
|
||||
_DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}
|
||||
|
||||
# Event types the burst aggregator must never group. The default
|
||||
# catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as
|
||||
# group-able, which is the right default for *negative* signals
|
||||
# (failures, errors, intrusion attempts) but produces noise when
|
||||
# applied to positive / informational events the user wants to see
|
||||
# individually.
|
||||
#
|
||||
# Concrete failure mode that motivated this list: on 2026-05-21 a
|
||||
# post-restart resolved-detection batch emitted two `error_resolved`
|
||||
# events for two stale keys at the same time. The aggregator paired
|
||||
# them and the user received a useless "+1 error_resolved en 0s
|
||||
# (2 en total) — Eventos adicionales: Condición resuelta" burst on
|
||||
# top of the original recovery message. The signal value of a
|
||||
# recovery is per-event; collapsing them adds zero information.
|
||||
_AGGREGATION_EXEMPT_EVENTS = frozenset({
|
||||
'error_resolved',
|
||||
})
|
||||
|
||||
|
||||
class BurstAggregator:
|
||||
"""Accumulates similar events in a time window, then sends a single summary.
|
||||
@@ -517,7 +535,16 @@ class BurstAggregator:
|
||||
ALL event types are aggregated: specific rules from AGGREGATION_RULES
|
||||
take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
|
||||
This prevents notification floods from any source.
|
||||
|
||||
Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS`
|
||||
bypass aggregation entirely and are returned to the dispatcher
|
||||
as-is. Meant for positive/informational events (recoveries today;
|
||||
scheduled-task completions are not yet listed) where collapsing into a burst
|
||||
summary destroys signal value.
|
||||
"""
|
||||
if event.event_type in _AGGREGATION_EXEMPT_EVENTS:
|
||||
return event
|
||||
|
||||
rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
|
||||
|
||||
bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"
|
||||
|
||||
Reference in New Issue
Block a user