Update notification service

This commit is contained in:
MacRimi
2026-03-04 19:11:38 +01:00
parent 66d2a68167
commit 9089035f18
5 changed files with 504 additions and 36 deletions

View File

@@ -1418,6 +1418,34 @@ def get_storage_info():
# print(f"Error getting partition info: {e}")
pass
# ── Register disks in observation system + enrich with observation counts ──
try:
active_dev_names = list(physical_disks.keys())
obs_counts = health_persistence.get_disks_observation_counts()
for disk_name, disk_info in physical_disks.items():
# Register each disk we see
health_persistence.register_disk(
device_name=disk_name,
serial=disk_info.get('serial', ''),
model=disk_info.get('model', ''),
size_bytes=disk_info.get('size_bytes'),
)
# Attach observation count: try serial match first, then device name
serial = disk_info.get('serial', '')
count = obs_counts.get(f'serial:{serial}', 0) if serial else 0
if count == 0:
count = obs_counts.get(disk_name, 0)
disk_info['observations_count'] = count
# Mark disks no longer present as removed
health_persistence.mark_removed_disks(active_dev_names)
# Auto-dismiss stale observations (> 30 days old)
health_persistence.cleanup_stale_observations()
except Exception:
pass
storage_data['disks'] = list(physical_disks.values())
return storage_data

View File

@@ -135,19 +135,22 @@ class HealthMonitor:
# These are logged at ERR level but are common on SATA controllers
# during hot-plug, link renegotiation, or cable noise. They are NOT
# indicative of disk failure unless SMART also reports problems.
r'ata\d+.*SError.*BadCRC',
r'ata\d+.*Emask 0x10.*ATA bus error',
r'failed command: (READ|WRITE) FPDMA QUEUED',
# NOTE: patterns are matched against line.lower(), so use lowercase.
r'ata\d+.*serror.*badcrc',
r'ata\d+.*emask 0x10.*ata bus error',
r'failed command: (read|write) fpdma queued',
r'ata\d+.*hard resetting link',
r'ata\d+.*link is slow',
r'ata\d+.*COMRESET',
r'ata\d+.*comreset',
# ── ProxMenux self-referential noise ──
# The monitor reporting its OWN service failures is circular --
# it cannot meaningfully alert about itself.
r'proxmenux-monitor\.service.*Failed',
# NOTE: patterns are matched against line.lower(), so use lowercase.
r'proxmenux-monitor\.service.*failed',
r'proxmenux-monitor\.service.*exit-code',
r'ProxMenux-Monitor.*Failed at step EXEC',
r'proxmenux-monitor.*failed at step exec',
r'proxmenux-monitor\.appimage',
# ── PVE scheduler operational noise ──
# pvescheduler emits "could not update job state" every minute
@@ -1147,6 +1150,42 @@ class HealthMonitor:
return storages
@staticmethod
def _make_io_obs_signature(disk: str, sample: str) -> str:
"""Create a stable observation signature for I/O errors on a disk.
All ATA errors on the same disk (exception Emask, revalidation failed,
hard resetting link, SError, etc.) map to ONE signature per error family.
This ensures that "Emask 0x1 SAct 0xc1000000" and "Emask 0x1 SAct 0x804000"
and "revalidation failed" all dedup into the same observation.
"""
if not sample:
return f'io_{disk}_generic'
s = sample.lower()
# Classify into error families (order matters: first match wins)
families = [
# ATA controller errors: exception, emask, revalidation, reset
# All these are symptoms of the same underlying connection issue
(r'exception\s+emask|emask\s+0x|revalidation failed|hard resetting link|'
r'serror.*badcrc|comreset|link is slow|status.*drdy',
'ata_connection_error'),
# SCSI / block-layer errors
(r'i/o error|blk_update_request|medium error|sense key',
'block_io_error'),
# Failed commands (READ/WRITE FPDMA QUEUED)
(r'failed command|fpdma queued',
'ata_failed_command'),
]
for pattern, family in families:
if re.search(pattern, s):
return f'io_{disk}_{family}'
# Fallback: generic per-disk
return f'io_{disk}_generic'
def _resolve_ata_to_disk(self, ata_port: str) -> str:
"""Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda').
@@ -1444,6 +1483,26 @@ class HealthMonitor:
smart_ok = smart_health == 'PASSED'
# ── Record disk observation (always, even if transient) ──
# Signature must be stable across cycles: strip volatile
# data (hex values, counts, timestamps) to dedup properly.
# e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
# and "ata8.00: revalidation failed (errno=-2)"
# both map to the same per-device I/O observation.
try:
obs_sig = self._make_io_obs_signature(disk, sample)
obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
health_persistence.record_disk_observation(
device_name=disk,
serial=None,
error_type='io_error',
error_signature=obs_sig,
raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
severity=obs_severity,
)
except Exception:
pass
# Transient-only errors (e.g. SError with auto-recovery)
# are always INFO regardless of SMART
if all_transient:

View File

@@ -150,6 +150,45 @@ class HealthPersistence:
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)')
# ── Disk Observations System ──
# Registry of all physical disks seen by the system
cursor.execute('''
CREATE TABLE IF NOT EXISTS disk_registry (
id INTEGER PRIMARY KEY AUTOINCREMENT,
device_name TEXT NOT NULL,
serial TEXT,
model TEXT,
size_bytes INTEGER,
first_seen TEXT NOT NULL,
last_seen TEXT NOT NULL,
removed INTEGER DEFAULT 0,
UNIQUE(device_name, serial)
)
''')
# Observation log: deduplicated error events per disk
cursor.execute('''
CREATE TABLE IF NOT EXISTS disk_observations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
disk_registry_id INTEGER NOT NULL,
error_type TEXT NOT NULL,
error_signature TEXT NOT NULL,
first_occurrence TEXT NOT NULL,
last_occurrence TEXT NOT NULL,
occurrence_count INTEGER DEFAULT 1,
raw_message TEXT,
severity TEXT DEFAULT 'warning',
dismissed INTEGER DEFAULT 0,
FOREIGN KEY(disk_registry_id) REFERENCES disk_registry(id),
UNIQUE(disk_registry_id, error_type, error_signature)
)
''')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_serial ON disk_registry(serial)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_device ON disk_registry(device_name)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
conn.commit()
conn.close()
@@ -519,10 +558,12 @@ class HealthPersistence:
}
child_prefix = CASCADE_PREFIXES.get(error_key)
if child_prefix:
# Only cascade to active (unresolved) child errors.
# Already-resolved/expired entries must NOT be re-surfaced.
cursor.execute('''
UPDATE errors
SET acknowledged = 1, resolved_at = ?, suppression_hours = ?
WHERE error_key LIKE ? AND acknowledged = 0
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
''', (now, sup_hours, child_prefix + '%'))
result = {
@@ -1119,5 +1160,225 @@ class HealthPersistence:
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
# ────────────────────────────────────────────────────────────────
# Disk Observations API
# ────────────────────────────────────────────────────────────────
def register_disk(self, device_name: str, serial: Optional[str] = None,
                  model: Optional[str] = None, size_bytes: Optional[int] = None):
    """Register or update a physical disk in the registry.

    Uses (device_name, serial) as unique key. If the disk was previously
    marked removed, it's re-activated. A ``None`` serial is stored as ''
    so the UNIQUE(device_name, serial) constraint always applies.

    Best-effort: any failure is logged and swallowed.
    """
    now = datetime.now().isoformat()
    try:
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # statement raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
                VALUES (?, ?, ?, ?, ?, ?, 0)
                ON CONFLICT(device_name, serial) DO UPDATE SET
                    model = COALESCE(excluded.model, model),
                    size_bytes = COALESCE(excluded.size_bytes, size_bytes),
                    last_seen = excluded.last_seen,
                    removed = 0
            ''', (device_name, serial or '', model, size_bytes, now, now))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
def _get_disk_registry_id(self, cursor, device_name: str,
serial: Optional[str] = None) -> Optional[int]:
"""Find disk_registry.id, matching by serial first, then device_name."""
if serial:
cursor.execute(
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
(serial,))
row = cursor.fetchone()
if row:
return row[0]
# Fallback: match by device_name (strip /dev/ prefix)
clean_dev = device_name.replace('/dev/', '')
cursor.execute(
'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
(clean_dev,))
row = cursor.fetchone()
return row[0] if row else None
def record_disk_observation(self, device_name: str, serial: Optional[str],
                            error_type: str, error_signature: str,
                            raw_message: str = '',
                            severity: str = 'warning'):
    """Record or deduplicate a disk error observation.

    error_type: 'smart_error', 'io_error', 'connection_error'
    error_signature: Normalized unique string for dedup
        (e.g. 'FailedReadSmartSelfTestLog')

    Best-effort: any failure is logged and swallowed.
    """
    now = datetime.now().isoformat()
    try:
        clean_dev = device_name.replace('/dev/', '')
        # Auto-register the disk if not present. register_disk() opens
        # and commits on its OWN connection, so do it BEFORE opening
        # ours -- holding two sqlite connections at once invites
        # "database is locked" errors.
        self.register_disk(clean_dev, serial)
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # upsert raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            disk_id = self._get_disk_registry_id(cursor, clean_dev, serial)
            if not disk_id:
                return
            # Upsert: same (disk, type, signature) bumps the count and
            # refreshes last_occurrence; severity can only escalate to
            # 'critical', never downgrade; dismissal is cleared.
            cursor.execute('''
                INSERT INTO disk_observations
                    (disk_registry_id, error_type, error_signature, first_occurrence,
                     last_occurrence, occurrence_count, raw_message, severity, dismissed)
                VALUES (?, ?, ?, ?, ?, 1, ?, ?, 0)
                ON CONFLICT(disk_registry_id, error_type, error_signature) DO UPDATE SET
                    last_occurrence = excluded.last_occurrence,
                    occurrence_count = occurrence_count + 1,
                    severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
                    dismissed = 0
            ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error recording disk observation: {e}")
def get_disk_observations(self, device_name: Optional[str] = None,
                          serial: Optional[str] = None) -> List[Dict[str, Any]]:
    """Get active (non-dismissed) observations for one disk or all disks.

    Returns a list of dicts (newest first); empty list on error or when
    the requested disk is unknown.
    """
    # Shared base query -- the two branches previously duplicated the
    # whole SELECT and differed only in the disk filter.
    base_query = '''
        SELECT o.id, o.error_type, o.error_signature,
               o.first_occurrence, o.last_occurrence,
               o.occurrence_count, o.raw_message, o.severity, o.dismissed,
               d.device_name, d.serial, d.model
        FROM disk_observations o
        JOIN disk_registry d ON o.disk_registry_id = d.id
        WHERE o.dismissed = 0
    '''
    try:
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if a
        # query raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            if device_name or serial:
                disk_id = self._get_disk_registry_id(cursor,
                                                     device_name or '', serial)
                if not disk_id:
                    return []
                cursor.execute(
                    base_query +
                    ' AND o.disk_registry_id = ? ORDER BY o.last_occurrence DESC',
                    (disk_id,))
            else:
                cursor.execute(base_query + ' ORDER BY o.last_occurrence DESC')
            rows = cursor.fetchall()
        finally:
            conn.close()
        return [{
            'id': r[0],
            'error_type': r[1],
            'error_signature': r[2],
            'first_occurrence': r[3],
            'last_occurrence': r[4],
            'occurrence_count': r[5],
            'raw_message': r[6] or '',
            'severity': r[7],
            'dismissed': bool(r[8]),
            'device_name': r[9],
            'serial': r[10],
            'model': r[11],
        } for r in rows]
    except Exception as e:
        print(f"[HealthPersistence] Error getting observations: {e}")
        return []
def get_disks_observation_counts(self) -> Dict[str, int]:
    """Return {device_name: count} of active observations per disk.

    Also includes 'serial:<serial>'-keyed entries for cross-device
    matching (device names can change between boots; serials don't).
    Returns an empty dict on error.
    """
    try:
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # query raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT d.device_name, d.serial, COUNT(o.id) as cnt
                FROM disk_observations o
                JOIN disk_registry d ON o.disk_registry_id = d.id
                WHERE o.dismissed = 0
                GROUP BY d.id
            ''')
            rows = cursor.fetchall()
        finally:
            conn.close()
        result: Dict[str, int] = {}
        for device_name, serial, cnt in rows:
            result[device_name] = cnt
            if serial:
                result[f'serial:{serial}'] = cnt
        return result
    except Exception as e:
        print(f"[HealthPersistence] Error getting observation counts: {e}")
        return {}
def dismiss_disk_observation(self, observation_id: int):
    """Mark a single observation as dismissed (hidden from active views).

    Best-effort: any failure is logged and swallowed.
    """
    try:
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # update raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            cursor.execute(
                'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
                (observation_id,))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error dismissing observation: {e}")
def cleanup_stale_observations(self, max_age_days: int = 30):
    """Auto-dismiss observations not seen in ``max_age_days`` days.

    Best-effort: any failure is logged and swallowed.
    """
    try:
        from datetime import timedelta
        # ISO-8601 timestamps compare correctly as strings.
        cutoff = (datetime.now() - timedelta(days=max_age_days)).isoformat()
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # update raises (the original leaked it on error).
        try:
            cursor = conn.cursor()
            cursor.execute('''
                UPDATE disk_observations
                SET dismissed = 1
                WHERE dismissed = 0 AND last_occurrence < ?
            ''', (cutoff,))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error cleaning stale observations: {e}")
def mark_removed_disks(self, active_device_names: List[str]):
    """Mark disks not in ``active_device_names`` as removed.

    An empty list is a deliberate no-op: a transient enumeration failure
    upstream must not flag every registered disk as removed.

    Best-effort: any failure is logged and swallowed.
    """
    # Nothing to compare against -- skip instead of opening a connection.
    if not active_device_names:
        return
    try:
        conn = self._get_conn()
        # try/finally guarantees the connection is closed even if the
        # update raises (the original leaked it on error). The original
        # also computed an unused `now` timestamp; dropped.
        try:
            cursor = conn.cursor()
            placeholders = ','.join('?' for _ in active_device_names)
            cursor.execute(f'''
                UPDATE disk_registry SET removed = 1
                WHERE device_name NOT IN ({placeholders}) AND removed = 0
            ''', active_device_names)
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        print(f"[HealthPersistence] Error marking removed disks: {e}")
# Global instance
health_persistence = HealthPersistence()

View File

@@ -690,6 +690,68 @@ class JournalWatcher:
except Exception:
return 'UNKNOWN'
def _record_smartd_observation(self, title: str, message: str):
    """Extract device info from a smartd system-mail and record it as a
    disk observation.

    Best-effort parser: any failure is logged and swallowed so that
    notification processing is never interrupted.
    """
    try:
        import re as _re
        from health_persistence import health_persistence
        # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda".
        # A lookahead (instead of a consuming trailing char class) also
        # matches a device name at the very end of the line -- the old
        # pattern required a trailing space/bracket/comma and missed those.
        dev_match = _re.search(r'Device:\s*/dev/(\S+?)(?=[\s\[\],]|$)', message)
        device = dev_match.group(1) if dev_match else ''
        if not device:
            return
        # Strip any partition suffix to get the base block device.
        # nvme/mmcblk partitions end in 'p<N>' (nvme0n1p2 -> nvme0n1);
        # classic sdX partitions end in '<N>' (sda1 -> sda). Blindly
        # stripping trailing digits would corrupt 'nvme0n1' -> 'nvme0n'.
        if device.startswith(('nvme', 'mmcblk')):
            base_dev = _re.sub(r'p\d+$', '', device)
        else:
            base_dev = _re.sub(r'\d+$', '', device)
        # Extract serial: "S/N:WD-WX72A30AA72R"
        sn_match = _re.search(r'S/N:\s*(\S+)', message)
        serial = sn_match.group(1) if sn_match else ''
        # Extract model: appears before S/N on the "Device info:" line
        model = ''
        model_match = _re.search(r'Device info:\s*\n?\s*(.+?)(?:,\s*S/N:)', message)
        if model_match:
            model = model_match.group(1).strip()
        # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
        error_type = 'smart_error'
        sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
        if sig_match:
            error_signature = sig_match.group(1)
        else:
            # Fallback: derive a signature from the "warning/error logged"
            # line, sanitized to a stable identifier-like string.
            warn_match = _re.search(
                r'warning/error was logged.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
            if warn_match:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_',
                                          warn_match.group(1).strip())[:80]
            else:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_', title)[:80]
        # Build a clean raw_message for display
        raw_msg = f"Device: /dev/{base_dev}"
        if model:
            raw_msg += f" ({model})"
        if serial:
            raw_msg += f" S/N:{serial}"
        warn_line_m = _re.search(
            r'The following warning/error.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
        if warn_line_m:
            raw_msg += f"\n{warn_line_m.group(1).strip()}"
        health_persistence.record_disk_observation(
            device_name=base_dev,
            serial=serial,
            error_type=error_type,
            error_signature=error_signature,
            raw_message=raw_msg,
            severity='warning',
        )
    except Exception as e:
        print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
@staticmethod
def _translate_ata_error(msg: str) -> str:
"""Translate common ATA/SCSI error codes to human-readable descriptions."""
@@ -1393,15 +1455,42 @@ class PollingCollector:
Tracking is stored in ``notification_last_sent`` (same DB).
"""
DIGEST_INTERVAL = 86400 # 24 h between re-notifications
DIGEST_INTERVAL = 86400 # 24 h default between re-notifications
UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans
NEW_ERROR_WINDOW = 120 # seconds errors younger than this are "new"
# Per-category anti-oscillation cooldowns (seconds).
# When an error resolves briefly and reappears, we still respect this
# interval before notifying again. This prevents "semi-cascades" where
# the same root cause generates many slightly different notifications.
#
# Key = health_persistence category name
# Value = minimum seconds between notifications for the same error_key
_CATEGORY_COOLDOWNS = {
'disks': 86400, # 24h - I/O errors are persistent hardware issues
'smart': 86400, # 24h - SMART errors same as I/O
'zfs': 86400, # 24h - ZFS pool issues are persistent
'storage': 3600, # 1h - storage availability can oscillate
'network': 1800, # 30m - network can flap
'pve_services': 1800, # 30m - services can restart/oscillate
'temperature': 3600, # 1h - temp can fluctuate near thresholds
'logs': 3600, # 1h - repeated log patterns
'vms': 1800, # 30m - VM state oscillation
'security': 3600, # 1h - auth failures tend to be bursty
'cpu': 1800, # 30m - CPU spikes can be transient
'memory': 1800, # 30m - memory pressure oscillation
'disk': 3600, # 1h - disk space can fluctuate near threshold
'updates': 86400, # 24h - update info doesn't change fast
}
_ENTITY_MAP = {
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
'disk': ('storage', ''), 'network': ('network', ''),
'load': ('node', ''),
'disk': ('storage', ''), 'disks': ('storage', ''), 'smart': ('storage', ''),
'zfs': ('storage', ''), 'storage': ('storage', ''),
'network': ('network', ''),
'pve_services': ('node', ''), 'security': ('user', ''),
'updates': ('node', ''), 'storage': ('storage', ''),
'updates': ('node', ''), 'logs': ('node', ''), 'vms': ('vm', ''),
}
# Map health-persistence category names to our TEMPLATES event types.
@@ -1412,14 +1501,14 @@ class PollingCollector:
'load': 'load_high',
'temperature': 'temp_high',
'disk': 'disk_space_low',
'disks': 'disk_io_error', # I/O errors from health monitor
'smart': 'disk_io_error', # SMART errors from health monitor
'zfs': 'disk_io_error', # ZFS pool/disk errors
'storage': 'storage_unavailable',
'network': 'network_down',
'pve_services': 'service_fail',
'security': 'auth_fail',
'updates': 'update_summary',
'zfs': 'disk_io_error',
'smart': 'disk_io_error',
'disks': 'disk_io_error',
'logs': 'system_problem',
'vms': 'system_problem',
}
@@ -1547,34 +1636,46 @@ class PollingCollector:
# Determine if we should notify
is_new = error_key not in self._known_errors
last_sent = self._last_notified.get(error_key, 0)
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
is_due = (now - last_sent) >= cat_cooldown
# For re-notifications (not new): skip if stale OR not due
# Anti-oscillation: even if "new" (resolved then reappeared),
# respect the per-category cooldown interval. This prevents
# "semi-cascades" where the same root cause generates multiple
# slightly different notifications across health check cycles.
# Each category has its own appropriate cooldown (30m for network,
# 24h for disks, 1h for temperature, etc.).
if not is_due:
continue
# For re-notifications (not new): also skip if stale
if not is_new:
if error_is_stale or not is_due:
if error_is_stale:
continue
# Map to our event type
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
# ── SMART gate for disk errors ──
# If the health monitor recorded a disk error but SMART is NOT
# FAILED, skip the notification entirely. Disk notifications
# should ONLY be sent when SMART confirms a real hardware failure.
# This prevents WARNING-level disk errors (SMART: unavailable)
# from being emitted as notifications at all.
# ── Disk I/O notification policy ──
# Disk I/O errors are ALWAYS notified (even when SMART says Passed)
# because recurring I/O errors are real issues that should not be hidden.
# The 24h cooldown is enforced per-device by NotificationManager
# (event_type 'disk_io_error' gets 86400s cooldown).
# For transient/INFO-level disk events (SMART OK, low error count),
# the health monitor already resolves them, so they won't appear here.
if category in ('disks', 'smart', 'zfs'):
details = error.get('details', {})
if isinstance(details, str):
details_raw = error.get('details', {})
if isinstance(details_raw, str):
try:
details = json.loads(details)
details_raw = json.loads(details_raw)
except (json.JSONDecodeError, TypeError):
details = {}
smart_status = details.get('smart_status', '') if isinstance(details, dict) else ''
if smart_status != 'FAILED':
# SMART is PASSED, UNKNOWN, or unavailable -- don't notify
continue
details_raw = {}
if isinstance(details_raw, dict):
# Extract device name for a stable entity_id (24h cooldown key)
dev = details_raw.get('device', details_raw.get('disk', ''))
if dev:
eid = f'disk_{dev}' # Stable per-device fingerprint
# Updates are always informational notifications except
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
@@ -2020,11 +2121,12 @@ class ProxmoxHookWatcher:
msg_lower = (message or '').lower()
title_lower_sm = (title or '').lower()
# ── Filter smartd noise ──
# FailedReadSmartErrorLog: smartd can't read the error log -- this is
# a firmware quirk on some WD/Seagate drives, NOT a disk failure.
# FailedReadSmartData: similar firmware issue.
# These should NOT generate notifications.
# ── Record disk observation regardless of noise filter ──
# Even "noise" events are recorded as observations so the user
# can see them in the Storage UI. We just don't send notifications.
self._record_smartd_observation(title or '', message or '')
# ── Filter smartd noise (suppress notification, not observation) ──
smartd_noise = [
'failedreadsmarterrorlog',
'failedreadsmartdata',

View File

@@ -767,11 +767,29 @@ class NotificationManager:
# Same as Proxmox's notification policy. The JournalWatcher already
# gates these through SMART verification + its own 24h dedup, but
# this acts as defense-in-depth in case a disk event arrives from
# another source (PollingCollector, hooks, etc.).
# another source (PollingCollector, hooks, health monitor, etc.).
_DISK_EVENTS = {'disk_io_error', 'storage_unavailable'}
if event.event_type in _DISK_EVENTS and cooldown_str is None:
cooldown = 86400 # 24 hours
# Health monitor state_change events: per-category cooldowns.
# Different health categories need different re-notification intervals.
# This is the defense-in-depth layer matching HealthEventWatcher's
# _CATEGORY_COOLDOWNS to prevent semi-cascades across all categories.
_HEALTH_CATEGORY_COOLDOWNS = {
'disks': 86400, 'smart': 86400, 'zfs': 86400, # 24h
'storage': 3600, 'temperature': 3600, 'logs': 3600,
'security': 3600, 'disk': 3600, # 1h
'network': 1800, 'pve_services': 1800,
'vms': 1800, 'cpu': 1800, 'memory': 1800, # 30m
'updates': 86400, # 24h
}
if event.event_type == 'state_change' and event.source == 'health':
cat = (event.data or {}).get('category', '')
cat_cd = _HEALTH_CATEGORY_COOLDOWNS.get(cat)
if cat_cd and cooldown_str is None:
cooldown = max(cooldown, cat_cd)
# Backup/replication events: each execution is unique and should
# always be delivered. A 10s cooldown prevents exact duplicates
# (webhook + tasks) but allows repeated backup jobs to report.