mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-22 16:44:48 +00:00
Update AppImage 1.2.1.2
This commit is contained in:
Binary file not shown.
@@ -1 +1 @@
|
|||||||
91da610577f6c7254db6941685d901afb0a5de228f1fcac02c4b6e2e72a63683 ProxMenux-1.2.1.2-beta.AppImage
|
1b72c977163192fba07cb6e18e8539d37c90e9624ff22e3ca2cc3c8a55ce8a8e ProxMenux-1.2.1.2-beta.AppImage
|
||||||
|
|||||||
@@ -381,10 +381,14 @@ def _detect_lxc_containers() -> list[dict]:
|
|||||||
a CT is seen. CT reinstalls with a different OS will keep the old
|
a CT is seen. CT reinstalls with a different OS will keep the old
|
||||||
family cached until the user resets the registry — acceptable
|
family cached until the user resets the registry — acceptable
|
||||||
trade-off vs paying the probe cost every 24h cycle.
|
trade-off vs paying the probe cost every 24h cycle.
|
||||||
"""
|
|
||||||
if not _lxc_updates_notification_enabled():
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
Detection runs unconditionally so the dashboard always reflects
|
||||||
|
pending updates on running CTs. The `lxc_updates_available`
|
||||||
|
notification toggle only gates the *delivery* of the notification
|
||||||
|
(see _check_managed_installs_updates in notification_events.py),
|
||||||
|
not the detection — that keeps the toggle semantics consistent with
|
||||||
|
every other update stream (NVIDIA, Coral, post-install).
|
||||||
|
"""
|
||||||
# Read existing registry so we can preserve cached `_os_family`.
|
# Read existing registry so we can preserve cached `_os_family`.
|
||||||
# No lock needed here — we only inspect; the framework holds the
|
# No lock needed here — we only inspect; the framework holds the
|
||||||
# write lock when it merges back our results in detect_and_register.
|
# write lock when it merges back our results in detect_and_register.
|
||||||
|
|||||||
@@ -2287,11 +2287,6 @@ class PollingCollector:
|
|||||||
# updates. The fingerprint encodes the per-CT state so a stable
|
# updates. The fingerprint encodes the per-CT state so a stable
|
||||||
# batch doesn't re-notify while a meaningful change does.
|
# batch doesn't re-notify while a meaningful change does.
|
||||||
self._notified_lxc_batch: str | None = None
|
self._notified_lxc_batch: str | None = None
|
||||||
# Track previous state of the LXC-updates notification toggle
|
|
||||||
# so a user enabling it post-startup bypasses the 24h gate
|
|
||||||
# ONCE — the next polling cycle runs a fresh detection without
|
|
||||||
# waiting up to a day. Cleared after the forced run completes.
|
|
||||||
self._lxc_was_enabled: bool = False
|
|
||||||
# Track notified ProxMenux versions to avoid duplicates
|
# Track notified ProxMenux versions to avoid duplicates
|
||||||
self._notified_proxmenux_version: str | None = None
|
self._notified_proxmenux_version: str | None = None
|
||||||
self._notified_proxmenux_beta_version: str | None = None
|
self._notified_proxmenux_beta_version: str | None = None
|
||||||
@@ -2675,6 +2670,20 @@ class PollingCollector:
|
|||||||
self._last_notified.pop(key, None)
|
self._last_notified.pop(key, None)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip recovery when the persisted snapshot lost the context
|
||||||
|
# for this error (reason / category both empty). Emitting a
|
||||||
|
# blank "Resuelto -" message with "Condition resolved" body
|
||||||
|
# adds no value — the user can't tell which error went away.
|
||||||
|
# Happens after a long Monitor downtime when the snapshot was
|
||||||
|
# serialized between polls without reason/category populated,
|
||||||
|
# or when a future code path slips a key into _known_errors
|
||||||
|
# without the full metadata. Drop the tracking entry silently
|
||||||
|
# so we never re-fire on the same incomplete record.
|
||||||
|
if not reason and not category:
|
||||||
|
self._last_notified.pop(key, None)
|
||||||
|
self._notified_severity.pop(key, None)
|
||||||
|
continue
|
||||||
|
|
||||||
# Skip recovery if the error was manually acknowledged (dismissed)
|
# Skip recovery if the error was manually acknowledged (dismissed)
|
||||||
# by the user. Acknowledged != resolved -- the problem may still
|
# by the user. Acknowledged != resolved -- the problem may still
|
||||||
# exist, the user just chose to suppress notifications for it.
|
# exist, the user just chose to suppress notifications for it.
|
||||||
@@ -2765,15 +2774,25 @@ class PollingCollector:
|
|||||||
original_severity = self._notified_severity.get(
|
original_severity = self._notified_severity.get(
|
||||||
key, old_meta.get('severity', 'WARNING'),
|
key, old_meta.get('severity', 'WARNING'),
|
||||||
)
|
)
|
||||||
|
# Defensive defaults — the template uses `{category}` and
|
||||||
|
# `{duration}` in both the title and the body; an empty
|
||||||
|
# `category` produced the cosmetic "Resolved - " with a
|
||||||
|
# trailing dash (and "The issue has been resolved" with
|
||||||
|
# a double space) on 2026-05-21. The Fix A filter above
|
||||||
|
# already drops the worst case (reason AND category empty);
|
||||||
|
# this layer fills the remaining edge cases when only one
|
||||||
|
# of the two is missing.
|
||||||
|
category_label = category or 'health'
|
||||||
|
duration_label = duration or 'unknown'
|
||||||
data = {
|
data = {
|
||||||
'hostname': self._hostname,
|
'hostname': self._hostname,
|
||||||
'category': category,
|
'category': category_label,
|
||||||
'reason': clean_reason,
|
'reason': clean_reason,
|
||||||
'error_key': key,
|
'error_key': key,
|
||||||
'severity': 'OK',
|
'severity': 'OK',
|
||||||
'original_severity': original_severity,
|
'original_severity': original_severity,
|
||||||
'first_seen': first_seen,
|
'first_seen': first_seen,
|
||||||
'duration': duration,
|
'duration': duration_label,
|
||||||
'is_recovery': True,
|
'is_recovery': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3243,23 +3262,7 @@ class PollingCollector:
|
|||||||
"""
|
"""
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
# Detect OFF→ON transition of the LXC update toggle. Without
|
if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
|
||||||
# this, the first polling cycle after service start always sets
|
|
||||||
# the 24h gate — so a user who enables the toggle later (which
|
|
||||||
# is the normal flow, since the toggle defaults to OFF) would
|
|
||||||
# have to wait up to 24h or restart the service before the
|
|
||||||
# detector ran. A one-shot bypass on the transition fixes that
|
|
||||||
# without weakening the 24h cadence in steady state.
|
|
||||||
try:
|
|
||||||
import managed_installs as _mi
|
|
||||||
lxc_enabled_now = _mi._lxc_updates_notification_enabled()
|
|
||||||
except Exception:
|
|
||||||
lxc_enabled_now = False
|
|
||||||
lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
|
|
||||||
self._lxc_was_enabled = lxc_enabled_now
|
|
||||||
|
|
||||||
if (not lxc_just_enabled
|
|
||||||
and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
|
|
||||||
return
|
return
|
||||||
self._last_managed_check = now
|
self._last_managed_check = now
|
||||||
|
|
||||||
@@ -3312,9 +3315,21 @@ class PollingCollector:
|
|||||||
# updates. The batch fingerprint is recomputed every cycle and
|
# updates. The batch fingerprint is recomputed every cycle and
|
||||||
# compared with the last notified one — if the set of CTs or
|
# compared with the last notified one — if the set of CTs or
|
||||||
# their per-CT fingerprints changed, we notify again.
|
# their per-CT fingerprints changed, we notify again.
|
||||||
if lxc_updates:
|
#
|
||||||
|
# Detection itself runs unconditionally so the dashboard always
|
||||||
|
# shows pending updates; the `lxc_updates_available` toggle only
|
||||||
|
# controls whether a notification is *emitted*. If it's off we
|
||||||
|
# skip the emit (and the dedup stamp) so re-enabling the toggle
|
||||||
|
# later fires the next pending batch immediately.
|
||||||
|
try:
|
||||||
|
import managed_installs as _mi
|
||||||
|
lxc_notif_enabled = _mi._lxc_updates_notification_enabled()
|
||||||
|
except Exception:
|
||||||
|
lxc_notif_enabled = False
|
||||||
|
|
||||||
|
if lxc_updates and lxc_notif_enabled:
|
||||||
self._emit_lxc_updates_batch(lxc_updates)
|
self._emit_lxc_updates_batch(lxc_updates)
|
||||||
else:
|
elif not lxc_updates:
|
||||||
# Empty batch — clear the dedup so a fresh batch later fires
|
# Empty batch — clear the dedup so a fresh batch later fires
|
||||||
# a new notification even with the same CTs/versions.
|
# a new notification even with the same CTs/versions.
|
||||||
self._notified_lxc_batch = None
|
self._notified_lxc_batch = None
|
||||||
@@ -3579,7 +3594,27 @@ class PollingCollector:
|
|||||||
print(f"[PollingCollector] Failed to save known_errors meta: {e}")
|
print(f"[PollingCollector] Failed to save known_errors meta: {e}")
|
||||||
|
|
||||||
def _load_last_notified(self):
|
def _load_last_notified(self):
|
||||||
"""Load per-error notification timestamps from DB on startup."""
|
"""Load per-error notification timestamps from DB on startup.
|
||||||
|
|
||||||
|
Reads only the per-key cooldown timestamps so the same-key
|
||||||
|
24h gate survives a restart. **Does NOT touch `_known_errors`**
|
||||||
|
— that snapshot is rebuilt exclusively by `_load_known_errors_meta`
|
||||||
|
(which carries the full reason / category / severity payload
|
||||||
|
needed to emit a meaningful recovery later).
|
||||||
|
|
||||||
|
The original implementation also injected synthetic rows into
|
||||||
|
`_known_errors` from this table, with `{'error_key': ek,
|
||||||
|
'first_seen': <epoch int>}` and nothing else. That made the
|
||||||
|
startup path believe the host had a populated baseline of
|
||||||
|
active errors, so the first post-restart poll computed
|
||||||
|
`resolved_keys = synthetic_dummies − current_keys` and emitted
|
||||||
|
recovery notifications with empty `reason` / `category` /
|
||||||
|
`severity` fields — the “Resuelto -” / “Condición
|
||||||
|
resuelta” ghosts the user saw on 2026-05-21. Stale rows
|
||||||
|
in this table never expire on their own (the bug was eternal:
|
||||||
|
every restart re-triggered the ghost), so the fix is to never
|
||||||
|
treat this table as a source of `_known_errors` content.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
|
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
|
||||||
if not db_path.exists():
|
if not db_path.exists():
|
||||||
@@ -3594,8 +3629,6 @@ class PollingCollector:
|
|||||||
for fp, ts in cursor.fetchall():
|
for fp, ts in cursor.fetchall():
|
||||||
error_key = fp.replace('health_', '', 1)
|
error_key = fp.replace('health_', '', 1)
|
||||||
self._last_notified[error_key] = ts
|
self._last_notified[error_key] = ts
|
||||||
# _known_errors is a dict (not a set), store minimal metadata
|
|
||||||
self._known_errors[error_key] = {'error_key': error_key, 'first_seen': ts}
|
|
||||||
conn.close()
|
conn.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[PollingCollector] Failed to load last_notified: {e}")
|
print(f"[PollingCollector] Failed to load last_notified: {e}")
|
||||||
|
|||||||
@@ -484,6 +484,24 @@ AGGREGATION_RULES = {
|
|||||||
# burst, avoiding notification floods from any source.
|
# burst, avoiding notification floods from any source.
|
||||||
_DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}
|
_DEFAULT_AGGREGATION = {'window': 60, 'min_count': 2, 'burst_type': 'burst_generic'}
|
||||||
|
|
||||||
|
# Event types the burst aggregator must never group. The default
|
||||||
|
# catch-all (`_DEFAULT_AGGREGATION`) treats anything unlisted as
|
||||||
|
# group-able, which is the right default for *negative* signals
|
||||||
|
# (failures, errors, intrusion attempts) but produces noise when
|
||||||
|
# applied to positive / informational events the user wants to see
|
||||||
|
# individually.
|
||||||
|
#
|
||||||
|
# Concrete failure mode that motivated this list: on 2026-05-21 a
|
||||||
|
# post-restart resolved-detection batch emitted two `error_resolved`
|
||||||
|
# events for two stale keys at the same time. The aggregator paired
|
||||||
|
# them and the user received a useless "+1 error_resolved en 0s
|
||||||
|
# (2 en total) — Eventos adicionales: Condición resuelta" burst on
|
||||||
|
# top of the original recovery message. The signal value of a
|
||||||
|
# recovery is per-event; collapsing them adds zero information.
|
||||||
|
_AGGREGATION_EXEMPT_EVENTS = frozenset({
|
||||||
|
'error_resolved',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
class BurstAggregator:
|
class BurstAggregator:
|
||||||
"""Accumulates similar events in a time window, then sends a single summary.
|
"""Accumulates similar events in a time window, then sends a single summary.
|
||||||
@@ -517,7 +535,16 @@ class BurstAggregator:
|
|||||||
ALL event types are aggregated: specific rules from AGGREGATION_RULES
|
ALL event types are aggregated: specific rules from AGGREGATION_RULES
|
||||||
take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
|
take priority, otherwise the _DEFAULT_AGGREGATION catch-all applies.
|
||||||
This prevents notification floods from any source.
|
This prevents notification floods from any source.
|
||||||
|
|
||||||
|
Exception: event types listed in `_AGGREGATION_EXEMPT_EVENTS`
|
||||||
|
bypass aggregation entirely and are returned to the dispatcher
|
||||||
|
as-is. Used for positive/informational events (recoveries,
|
||||||
|
scheduled-task completions) where collapsing into a burst
|
||||||
|
summary destroys signal value.
|
||||||
"""
|
"""
|
||||||
|
if event.event_type in _AGGREGATION_EXEMPT_EVENTS:
|
||||||
|
return event
|
||||||
|
|
||||||
rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
|
rule = AGGREGATION_RULES.get(event.event_type, _DEFAULT_AGGREGATION)
|
||||||
|
|
||||||
bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"
|
bucket_key = f"{event.event_type}:{event.data.get('hostname', '')}"
|
||||||
|
|||||||
Reference in New Issue
Block a user