mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 01:52:20 +00:00
Update notification service
This commit is contained in:
@@ -1418,6 +1418,34 @@ def get_storage_info():
|
||||
# print(f"Error getting partition info: {e}")
|
||||
pass
|
||||
|
||||
# ── Register disks in observation system + enrich with observation counts ──
|
||||
try:
|
||||
active_dev_names = list(physical_disks.keys())
|
||||
obs_counts = health_persistence.get_disks_observation_counts()
|
||||
|
||||
for disk_name, disk_info in physical_disks.items():
|
||||
# Register each disk we see
|
||||
health_persistence.register_disk(
|
||||
device_name=disk_name,
|
||||
serial=disk_info.get('serial', ''),
|
||||
model=disk_info.get('model', ''),
|
||||
size_bytes=disk_info.get('size_bytes'),
|
||||
)
|
||||
|
||||
# Attach observation count: try serial match first, then device name
|
||||
serial = disk_info.get('serial', '')
|
||||
count = obs_counts.get(f'serial:{serial}', 0) if serial else 0
|
||||
if count == 0:
|
||||
count = obs_counts.get(disk_name, 0)
|
||||
disk_info['observations_count'] = count
|
||||
|
||||
# Mark disks no longer present as removed
|
||||
health_persistence.mark_removed_disks(active_dev_names)
|
||||
# Auto-dismiss stale observations (> 30 days old)
|
||||
health_persistence.cleanup_stale_observations()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
storage_data['disks'] = list(physical_disks.values())
|
||||
|
||||
return storage_data
|
||||
|
||||
@@ -135,19 +135,22 @@ class HealthMonitor:
|
||||
# These are logged at ERR level but are common on SATA controllers
|
||||
# during hot-plug, link renegotiation, or cable noise. They are NOT
|
||||
# indicative of disk failure unless SMART also reports problems.
|
||||
r'ata\d+.*SError.*BadCRC',
|
||||
r'ata\d+.*Emask 0x10.*ATA bus error',
|
||||
r'failed command: (READ|WRITE) FPDMA QUEUED',
|
||||
# NOTE: patterns are matched against line.lower(), so use lowercase.
|
||||
r'ata\d+.*serror.*badcrc',
|
||||
r'ata\d+.*emask 0x10.*ata bus error',
|
||||
r'failed command: (read|write) fpdma queued',
|
||||
r'ata\d+.*hard resetting link',
|
||||
r'ata\d+.*link is slow',
|
||||
r'ata\d+.*COMRESET',
|
||||
r'ata\d+.*comreset',
|
||||
|
||||
# ── ProxMenux self-referential noise ──
|
||||
# The monitor reporting its OWN service failures is circular --
|
||||
# it cannot meaningfully alert about itself.
|
||||
r'proxmenux-monitor\.service.*Failed',
|
||||
# NOTE: patterns are matched against line.lower(), so use lowercase.
|
||||
r'proxmenux-monitor\.service.*failed',
|
||||
r'proxmenux-monitor\.service.*exit-code',
|
||||
r'ProxMenux-Monitor.*Failed at step EXEC',
|
||||
r'proxmenux-monitor.*failed at step exec',
|
||||
r'proxmenux-monitor\.appimage',
|
||||
|
||||
# ── PVE scheduler operational noise ──
|
||||
# pvescheduler emits "could not update job state" every minute
|
||||
@@ -1147,6 +1150,42 @@ class HealthMonitor:
|
||||
|
||||
return storages
|
||||
|
||||
@staticmethod
|
||||
def _make_io_obs_signature(disk: str, sample: str) -> str:
|
||||
"""Create a stable observation signature for I/O errors on a disk.
|
||||
|
||||
All ATA errors on the same disk (exception Emask, revalidation failed,
|
||||
hard resetting link, SError, etc.) map to ONE signature per error family.
|
||||
This ensures that "Emask 0x1 SAct 0xc1000000" and "Emask 0x1 SAct 0x804000"
|
||||
and "revalidation failed" all dedup into the same observation.
|
||||
"""
|
||||
if not sample:
|
||||
return f'io_{disk}_generic'
|
||||
|
||||
s = sample.lower()
|
||||
|
||||
# Classify into error families (order matters: first match wins)
|
||||
families = [
|
||||
# ATA controller errors: exception, emask, revalidation, reset
|
||||
# All these are symptoms of the same underlying connection issue
|
||||
(r'exception\s+emask|emask\s+0x|revalidation failed|hard resetting link|'
|
||||
r'serror.*badcrc|comreset|link is slow|status.*drdy',
|
||||
'ata_connection_error'),
|
||||
# SCSI / block-layer errors
|
||||
(r'i/o error|blk_update_request|medium error|sense key',
|
||||
'block_io_error'),
|
||||
# Failed commands (READ/WRITE FPDMA QUEUED)
|
||||
(r'failed command|fpdma queued',
|
||||
'ata_failed_command'),
|
||||
]
|
||||
|
||||
for pattern, family in families:
|
||||
if re.search(pattern, s):
|
||||
return f'io_{disk}_{family}'
|
||||
|
||||
# Fallback: generic per-disk
|
||||
return f'io_{disk}_generic'
|
||||
|
||||
def _resolve_ata_to_disk(self, ata_port: str) -> str:
|
||||
"""Resolve an ATA controller name (e.g. 'ata8') to a block device (e.g. 'sda').
|
||||
|
||||
@@ -1444,6 +1483,26 @@ class HealthMonitor:
|
||||
|
||||
smart_ok = smart_health == 'PASSED'
|
||||
|
||||
# ── Record disk observation (always, even if transient) ──
|
||||
# Signature must be stable across cycles: strip volatile
|
||||
# data (hex values, counts, timestamps) to dedup properly.
|
||||
# e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
|
||||
# and "ata8.00: revalidation failed (errno=-2)"
|
||||
# both map to the same per-device I/O observation.
|
||||
try:
|
||||
obs_sig = self._make_io_obs_signature(disk, sample)
|
||||
obs_severity = 'critical' if smart_health == 'FAILED' else 'warning'
|
||||
health_persistence.record_disk_observation(
|
||||
device_name=disk,
|
||||
serial=None,
|
||||
error_type='io_error',
|
||||
error_signature=obs_sig,
|
||||
raw_message=f'{display}: {error_count} I/O event(s) in 5 min (SMART: {smart_health})\n{sample}',
|
||||
severity=obs_severity,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Transient-only errors (e.g. SError with auto-recovery)
|
||||
# are always INFO regardless of SMART
|
||||
if all_transient:
|
||||
|
||||
@@ -150,6 +150,45 @@ class HealthPersistence:
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_notif_severity ON notification_history(severity)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_nls_ts ON notification_last_sent(last_sent_ts)')
|
||||
|
||||
# ── Disk Observations System ──
|
||||
# Registry of all physical disks seen by the system
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS disk_registry (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
device_name TEXT NOT NULL,
|
||||
serial TEXT,
|
||||
model TEXT,
|
||||
size_bytes INTEGER,
|
||||
first_seen TEXT NOT NULL,
|
||||
last_seen TEXT NOT NULL,
|
||||
removed INTEGER DEFAULT 0,
|
||||
UNIQUE(device_name, serial)
|
||||
)
|
||||
''')
|
||||
|
||||
# Observation log: deduplicated error events per disk
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS disk_observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
disk_registry_id INTEGER NOT NULL,
|
||||
error_type TEXT NOT NULL,
|
||||
error_signature TEXT NOT NULL,
|
||||
first_occurrence TEXT NOT NULL,
|
||||
last_occurrence TEXT NOT NULL,
|
||||
occurrence_count INTEGER DEFAULT 1,
|
||||
raw_message TEXT,
|
||||
severity TEXT DEFAULT 'warning',
|
||||
dismissed INTEGER DEFAULT 0,
|
||||
FOREIGN KEY(disk_registry_id) REFERENCES disk_registry(id),
|
||||
UNIQUE(disk_registry_id, error_type, error_signature)
|
||||
)
|
||||
''')
|
||||
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_serial ON disk_registry(serial)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_disk_device ON disk_registry(device_name)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_disk ON disk_observations(disk_registry_id)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_obs_dismissed ON disk_observations(dismissed)')
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
@@ -519,10 +558,12 @@ class HealthPersistence:
|
||||
}
|
||||
child_prefix = CASCADE_PREFIXES.get(error_key)
|
||||
if child_prefix:
|
||||
# Only cascade to active (unresolved) child errors.
|
||||
# Already-resolved/expired entries must NOT be re-surfaced.
|
||||
cursor.execute('''
|
||||
UPDATE errors
|
||||
SET acknowledged = 1, resolved_at = ?, suppression_hours = ?
|
||||
WHERE error_key LIKE ? AND acknowledged = 0
|
||||
WHERE error_key LIKE ? AND acknowledged = 0 AND resolved_at IS NULL
|
||||
''', (now, sup_hours, child_prefix + '%'))
|
||||
|
||||
result = {
|
||||
@@ -1119,5 +1160,225 @@ class HealthPersistence:
|
||||
print(f"[HealthPersistence] Error recording UNKNOWN persistent: {e}")
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
# Disk Observations API
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
|
||||
def register_disk(self, device_name: str, serial: Optional[str] = None,
                  model: Optional[str] = None, size_bytes: Optional[int] = None):
    """Register or update a physical disk in the registry.

    Uses (device_name, serial) as the unique key; serial is normalized to
    '' so the UNIQUE constraint applies (SQLite treats NULLs as distinct).
    If the disk was previously marked removed, it is re-activated.

    Best-effort: any DB error is logged and swallowed.
    """
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # Upsert: keep existing model/size when the caller passes None.
        cursor.execute('''
            INSERT INTO disk_registry (device_name, serial, model, size_bytes, first_seen, last_seen, removed)
            VALUES (?, ?, ?, ?, ?, ?, 0)
            ON CONFLICT(device_name, serial) DO UPDATE SET
                model = COALESCE(excluded.model, model),
                size_bytes = COALESCE(excluded.size_bytes, size_bytes),
                last_seen = excluded.last_seen,
                removed = 0
        ''', (device_name, serial or '', model, size_bytes, now, now))

        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error registering disk {device_name}: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def _get_disk_registry_id(self, cursor, device_name: str,
|
||||
serial: Optional[str] = None) -> Optional[int]:
|
||||
"""Find disk_registry.id, matching by serial first, then device_name."""
|
||||
if serial:
|
||||
cursor.execute(
|
||||
'SELECT id FROM disk_registry WHERE serial = ? AND serial != "" ORDER BY last_seen DESC LIMIT 1',
|
||||
(serial,))
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return row[0]
|
||||
# Fallback: match by device_name (strip /dev/ prefix)
|
||||
clean_dev = device_name.replace('/dev/', '')
|
||||
cursor.execute(
|
||||
'SELECT id FROM disk_registry WHERE device_name = ? ORDER BY last_seen DESC LIMIT 1',
|
||||
(clean_dev,))
|
||||
row = cursor.fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
def record_disk_observation(self, device_name: str, serial: Optional[str],
                            error_type: str, error_signature: str,
                            raw_message: str = '',
                            severity: str = 'warning'):
    """Record or deduplicate a disk error observation.

    Args:
        device_name: Block device, with or without '/dev/' prefix.
        serial: Disk serial number (may be None/empty).
        error_type: 'smart_error', 'io_error', 'connection_error'.
        error_signature: Normalized unique string for dedup
            (e.g. 'FailedReadSmartSelfTestLog').
        raw_message: Human-readable detail kept for display.
        severity: 'warning' or 'critical'. An existing observation is only
            ever escalated to critical, never downgraded.

    Best-effort: any DB error is logged and swallowed.
    """
    now = datetime.now().isoformat()
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # Auto-register the disk if not present
        clean_dev = device_name.replace('/dev/', '')
        self.register_disk(clean_dev, serial)

        disk_id = self._get_disk_registry_id(cursor, clean_dev, serial)
        if not disk_id:
            return  # unknown disk; connection closed in finally

        # Upsert observation: same (disk, type, signature) bumps the
        # counter, refreshes last_occurrence, and un-dismisses the entry.
        cursor.execute('''
            INSERT INTO disk_observations
                (disk_registry_id, error_type, error_signature, first_occurrence,
                 last_occurrence, occurrence_count, raw_message, severity, dismissed)
            VALUES (?, ?, ?, ?, ?, 1, ?, ?, 0)
            ON CONFLICT(disk_registry_id, error_type, error_signature) DO UPDATE SET
                last_occurrence = excluded.last_occurrence,
                occurrence_count = occurrence_count + 1,
                severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END,
                dismissed = 0
        ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))

        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error recording disk observation: {e}")
    finally:
        # Close even when execute/commit raised (original leaked the
        # connection on the error path).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_disk_observations(self, device_name: Optional[str] = None,
                          serial: Optional[str] = None) -> List[Dict[str, Any]]:
    """Get active (non-dismissed) observations for one disk or all disks.

    Args:
        device_name: Optional block device name (with or without '/dev/').
        serial: Optional disk serial; preferred for disk lookup.

    Returns:
        List of observation dicts, newest (by last_occurrence) first.
        Empty list when the requested disk is unknown or on any DB error.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()

        # One query template for both the per-disk and all-disks cases;
        # the original duplicated the 12-column SELECT, which invites
        # drift between the two copies.
        disk_filter = ''
        params: tuple = ()
        if device_name or serial:
            disk_id = self._get_disk_registry_id(cursor,
                                                 device_name or '', serial)
            if not disk_id:
                return []  # unknown disk; connection closed in finally
            disk_filter = ' AND o.disk_registry_id = ?'
            params = (disk_id,)

        cursor.execute(f'''
            SELECT o.id, o.error_type, o.error_signature,
                   o.first_occurrence, o.last_occurrence,
                   o.occurrence_count, o.raw_message, o.severity, o.dismissed,
                   d.device_name, d.serial, d.model
            FROM disk_observations o
            JOIN disk_registry d ON o.disk_registry_id = d.id
            WHERE o.dismissed = 0{disk_filter}
            ORDER BY o.last_occurrence DESC
        ''', params)

        rows = cursor.fetchall()

        return [{
            'id': r[0],
            'error_type': r[1],
            'error_signature': r[2],
            'first_occurrence': r[3],
            'last_occurrence': r[4],
            'occurrence_count': r[5],
            'raw_message': r[6] or '',
            'severity': r[7],
            'dismissed': bool(r[8]),
            'device_name': r[9],
            'serial': r[10],
            'model': r[11],
        } for r in rows]
    except Exception as e:
        print(f"[HealthPersistence] Error getting observations: {e}")
        return []
    finally:
        # Close even on the error path (original leaked the connection
        # when a query raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def get_disks_observation_counts(self) -> Dict[str, int]:
    """Return {device_name: count} of active observations per disk.

    Also includes serial-keyed entries ('serial:<sn>') for cross-device
    matching, so callers can match a disk whose /dev name changed.

    Returns:
        Mapping of device name (and serial key, when the serial is
        non-empty) to active-observation count; {} on any DB error.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute('''
            SELECT d.device_name, d.serial, COUNT(o.id) as cnt
            FROM disk_observations o
            JOIN disk_registry d ON o.disk_registry_id = d.id
            WHERE o.dismissed = 0
            GROUP BY d.id
        ''')
        result = {}
        for device_name, serial, cnt in cursor.fetchall():
            result[device_name] = cnt
            if serial:
                # Secondary key for serial-based lookup by callers.
                result[f'serial:{serial}'] = cnt
        return result
    except Exception as e:
        print(f"[HealthPersistence] Error getting observation counts: {e}")
        return {}
    finally:
        # Close even on the error path (original leaked the connection
        # when the query raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def dismiss_disk_observation(self, observation_id: int):
    """Mark a single observation as dismissed.

    Args:
        observation_id: disk_observations.id of the row to dismiss.

    Best-effort: a DB error is logged and swallowed; dismissing an
    unknown id is a silent no-op (UPDATE matches zero rows).
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute(
            'UPDATE disk_observations SET dismissed = 1 WHERE id = ?',
            (observation_id,))
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error dismissing observation: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def cleanup_stale_observations(self, max_age_days: int = 30):
    """Auto-dismiss observations not seen in max_age_days.

    Args:
        max_age_days: Age threshold in days; active observations whose
            last_occurrence is older than this are dismissed.

    Best-effort: any DB error is logged and swallowed.
    """
    conn = None
    try:
        from datetime import timedelta
        # ISO-8601 strings compare lexicographically in timestamp order,
        # so a plain string comparison against the cutoff is correct.
        cutoff = (datetime.now() - timedelta(days=max_age_days)).isoformat()
        conn = self._get_conn()
        cursor = conn.cursor()
        cursor.execute('''
            UPDATE disk_observations
            SET dismissed = 1
            WHERE dismissed = 0 AND last_occurrence < ?
        ''', (cutoff,))
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error cleaning stale observations: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised).
        if conn is not None:
            conn.close()
|
||||
|
||||
def mark_removed_disks(self, active_device_names: List[str]):
    """Mark registry disks not in active_device_names as removed.

    Args:
        active_device_names: Device names currently present on the host.
            When empty, nothing is marked (defensive: an empty scan must
            not wipe the whole registry).

    Best-effort: any DB error is logged and swallowed.
    """
    conn = None
    try:
        conn = self._get_conn()
        cursor = conn.cursor()
        if active_device_names:
            # NOT IN (?,?,...) with one placeholder per active device.
            placeholders = ','.join('?' for _ in active_device_names)
            cursor.execute(f'''
                UPDATE disk_registry SET removed = 1
                WHERE device_name NOT IN ({placeholders}) AND removed = 0
            ''', active_device_names)
        conn.commit()
    except Exception as e:
        print(f"[HealthPersistence] Error marking removed disks: {e}")
    finally:
        # Close even on the error path (original leaked the connection
        # when execute/commit raised). Also dropped the original's
        # unused `now` timestamp variable.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
# Global instance
|
||||
health_persistence = HealthPersistence()
|
||||
|
||||
@@ -690,6 +690,68 @@ class JournalWatcher:
|
||||
except Exception:
|
||||
return 'UNKNOWN'
|
||||
|
||||
def _record_smartd_observation(self, title: str, message: str):
    """Extract device info from a smartd system-mail and record as disk observation.

    Parses the device path, serial, model and error signature out of the
    smartd mail body/title, then forwards them to
    health_persistence.record_disk_observation(). Best-effort: any parse
    or persistence error is logged and swallowed.
    """
    try:
        import re as _re
        from health_persistence import health_persistence

        # Extract device path: "Device: /dev/sdh [SAT]" or "Device: /dev/sda"
        dev_match = _re.search(r'Device:\s*/dev/(\S+?)[\s\[\],]', message)
        device = dev_match.group(1) if dev_match else ''
        if not device:
            return
        # Strip the partition suffix to get the base block device.
        # NVMe / mmcblk devices use a 'p<N>' partition suffix (nvme0n1p2);
        # stripping bare trailing digits there would mangle the namespace
        # part (nvme0n1 -> nvme0n). SATA/SCSI names use plain digits (sda1).
        if _re.match(r'(nvme|mmcblk)', device):
            base_dev = _re.sub(r'p\d+$', '', device)
        else:
            base_dev = _re.sub(r'\d+$', '', device)

        # Extract serial: "S/N:WD-WX72A30AA72R"
        sn_match = _re.search(r'S/N:\s*(\S+)', message)
        serial = sn_match.group(1) if sn_match else ''

        # Extract model: appears before S/N on the "Device info:" line
        model = ''
        model_match = _re.search(r'Device info:\s*\n?\s*(.+?)(?:,\s*S/N:)', message)
        if model_match:
            model = model_match.group(1).strip()

        # Extract error signature from title: "SMART error (FailedReadSmartSelfTestLog)"
        sig_match = _re.search(r'SMART error\s*\((\w+)\)', title)
        if sig_match:
            error_signature = sig_match.group(1)
            error_type = 'smart_error'
        else:
            # Fallback: extract the "warning/error logged" line; sanitize
            # to a stable alphanumeric signature capped at 80 chars.
            warn_match = _re.search(
                r'warning/error was logged.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
            if warn_match:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_',
                                          warn_match.group(1).strip())[:80]
            else:
                error_signature = _re.sub(r'[^a-zA-Z0-9_]', '_', title)[:80]
            error_type = 'smart_error'

        # Build a clean raw_message for display
        raw_msg = f"Device: /dev/{base_dev}"
        if model:
            raw_msg += f" ({model})"
        if serial:
            raw_msg += f" S/N:{serial}"
        warn_line_m = _re.search(
            r'The following warning/error.*?:\s*\n?\s*(.+)', message, _re.IGNORECASE)
        if warn_line_m:
            raw_msg += f"\n{warn_line_m.group(1).strip()}"

        health_persistence.record_disk_observation(
            device_name=base_dev,
            serial=serial,
            error_type=error_type,
            error_signature=error_signature,
            raw_message=raw_msg,
            severity='warning',
        )
    except Exception as e:
        print(f"[DiskIOEventProcessor] Error recording smartd observation: {e}")
|
||||
|
||||
@staticmethod
|
||||
def _translate_ata_error(msg: str) -> str:
|
||||
"""Translate common ATA/SCSI error codes to human-readable descriptions."""
|
||||
@@ -1393,15 +1455,42 @@ class PollingCollector:
|
||||
Tracking is stored in ``notification_last_sent`` (same DB).
|
||||
"""
|
||||
|
||||
DIGEST_INTERVAL = 86400 # 24 h between re-notifications
|
||||
DIGEST_INTERVAL = 86400 # 24 h default between re-notifications
|
||||
UPDATE_CHECK_INTERVAL = 86400 # 24 h between update scans
|
||||
NEW_ERROR_WINDOW = 120 # seconds – errors younger than this are "new"
|
||||
|
||||
# Per-category anti-oscillation cooldowns (seconds).
|
||||
# When an error resolves briefly and reappears, we still respect this
|
||||
# interval before notifying again. This prevents "semi-cascades" where
|
||||
# the same root cause generates many slightly different notifications.
|
||||
#
|
||||
# Key = health_persistence category name
|
||||
# Value = minimum seconds between notifications for the same error_key
|
||||
_CATEGORY_COOLDOWNS = {
|
||||
'disks': 86400, # 24h - I/O errors are persistent hardware issues
|
||||
'smart': 86400, # 24h - SMART errors same as I/O
|
||||
'zfs': 86400, # 24h - ZFS pool issues are persistent
|
||||
'storage': 3600, # 1h - storage availability can oscillate
|
||||
'network': 1800, # 30m - network can flap
|
||||
'pve_services': 1800, # 30m - services can restart/oscillate
|
||||
'temperature': 3600, # 1h - temp can fluctuate near thresholds
|
||||
'logs': 3600, # 1h - repeated log patterns
|
||||
'vms': 1800, # 30m - VM state oscillation
|
||||
'security': 3600, # 1h - auth failures tend to be bursty
|
||||
'cpu': 1800, # 30m - CPU spikes can be transient
|
||||
'memory': 1800, # 30m - memory pressure oscillation
|
||||
'disk': 3600, # 1h - disk space can fluctuate near threshold
|
||||
'updates': 86400, # 24h - update info doesn't change fast
|
||||
}
|
||||
|
||||
_ENTITY_MAP = {
|
||||
'cpu': ('node', ''), 'memory': ('node', ''), 'temperature': ('node', ''),
|
||||
'disk': ('storage', ''), 'network': ('network', ''),
|
||||
'load': ('node', ''),
|
||||
'disk': ('storage', ''), 'disks': ('storage', ''), 'smart': ('storage', ''),
|
||||
'zfs': ('storage', ''), 'storage': ('storage', ''),
|
||||
'network': ('network', ''),
|
||||
'pve_services': ('node', ''), 'security': ('user', ''),
|
||||
'updates': ('node', ''), 'storage': ('storage', ''),
|
||||
'updates': ('node', ''), 'logs': ('node', ''), 'vms': ('vm', ''),
|
||||
}
|
||||
|
||||
# Map health-persistence category names to our TEMPLATES event types.
|
||||
@@ -1412,14 +1501,14 @@ class PollingCollector:
|
||||
'load': 'load_high',
|
||||
'temperature': 'temp_high',
|
||||
'disk': 'disk_space_low',
|
||||
'disks': 'disk_io_error', # I/O errors from health monitor
|
||||
'smart': 'disk_io_error', # SMART errors from health monitor
|
||||
'zfs': 'disk_io_error', # ZFS pool/disk errors
|
||||
'storage': 'storage_unavailable',
|
||||
'network': 'network_down',
|
||||
'pve_services': 'service_fail',
|
||||
'security': 'auth_fail',
|
||||
'updates': 'update_summary',
|
||||
'zfs': 'disk_io_error',
|
||||
'smart': 'disk_io_error',
|
||||
'disks': 'disk_io_error',
|
||||
'logs': 'system_problem',
|
||||
'vms': 'system_problem',
|
||||
}
|
||||
@@ -1547,34 +1636,46 @@ class PollingCollector:
|
||||
# Determine if we should notify
|
||||
is_new = error_key not in self._known_errors
|
||||
last_sent = self._last_notified.get(error_key, 0)
|
||||
is_due = (now - last_sent) >= self.DIGEST_INTERVAL
|
||||
cat_cooldown = self._CATEGORY_COOLDOWNS.get(category, self.DIGEST_INTERVAL)
|
||||
is_due = (now - last_sent) >= cat_cooldown
|
||||
|
||||
# For re-notifications (not new): skip if stale OR not due
|
||||
# Anti-oscillation: even if "new" (resolved then reappeared),
|
||||
# respect the per-category cooldown interval. This prevents
|
||||
# "semi-cascades" where the same root cause generates multiple
|
||||
# slightly different notifications across health check cycles.
|
||||
# Each category has its own appropriate cooldown (30m for network,
|
||||
# 24h for disks, 1h for temperature, etc.).
|
||||
if not is_due:
|
||||
continue
|
||||
|
||||
# For re-notifications (not new): also skip if stale
|
||||
if not is_new:
|
||||
if error_is_stale or not is_due:
|
||||
if error_is_stale:
|
||||
continue
|
||||
|
||||
# Map to our event type
|
||||
event_type = self._CATEGORY_TO_EVENT_TYPE.get(category, 'system_problem')
|
||||
entity, eid = self._ENTITY_MAP.get(category, ('node', ''))
|
||||
|
||||
# ── SMART gate for disk errors ──
|
||||
# If the health monitor recorded a disk error but SMART is NOT
|
||||
# FAILED, skip the notification entirely. Disk notifications
|
||||
# should ONLY be sent when SMART confirms a real hardware failure.
|
||||
# This prevents WARNING-level disk errors (SMART: unavailable)
|
||||
# from being emitted as notifications at all.
|
||||
# ── Disk I/O notification policy ──
|
||||
# Disk I/O errors are ALWAYS notified (even when SMART says Passed)
|
||||
# because recurring I/O errors are real issues that should not be hidden.
|
||||
# The 24h cooldown is enforced per-device by NotificationManager
|
||||
# (event_type 'disk_io_error' gets 86400s cooldown).
|
||||
# For transient/INFO-level disk events (SMART OK, low error count),
|
||||
# the health monitor already resolves them, so they won't appear here.
|
||||
if category in ('disks', 'smart', 'zfs'):
|
||||
details = error.get('details', {})
|
||||
if isinstance(details, str):
|
||||
details_raw = error.get('details', {})
|
||||
if isinstance(details_raw, str):
|
||||
try:
|
||||
details = json.loads(details)
|
||||
details_raw = json.loads(details_raw)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
details = {}
|
||||
smart_status = details.get('smart_status', '') if isinstance(details, dict) else ''
|
||||
if smart_status != 'FAILED':
|
||||
# SMART is PASSED, UNKNOWN, or unavailable -- don't notify
|
||||
continue
|
||||
details_raw = {}
|
||||
if isinstance(details_raw, dict):
|
||||
# Extract device name for a stable entity_id (24h cooldown key)
|
||||
dev = details_raw.get('device', details_raw.get('disk', ''))
|
||||
if dev:
|
||||
eid = f'disk_{dev}' # Stable per-device fingerprint
|
||||
|
||||
# Updates are always informational notifications except
|
||||
# system_age which can be WARNING (365+ days) or CRITICAL (548+ days).
|
||||
@@ -2020,11 +2121,12 @@ class ProxmoxHookWatcher:
|
||||
msg_lower = (message or '').lower()
|
||||
title_lower_sm = (title or '').lower()
|
||||
|
||||
# ── Filter smartd noise ──
|
||||
# FailedReadSmartErrorLog: smartd can't read the error log -- this is
|
||||
# a firmware quirk on some WD/Seagate drives, NOT a disk failure.
|
||||
# FailedReadSmartData: similar firmware issue.
|
||||
# These should NOT generate notifications.
|
||||
# ── Record disk observation regardless of noise filter ──
|
||||
# Even "noise" events are recorded as observations so the user
|
||||
# can see them in the Storage UI. We just don't send notifications.
|
||||
self._record_smartd_observation(title or '', message or '')
|
||||
|
||||
# ── Filter smartd noise (suppress notification, not observation) ──
|
||||
smartd_noise = [
|
||||
'failedreadsmarterrorlog',
|
||||
'failedreadsmartdata',
|
||||
|
||||
@@ -767,11 +767,29 @@ class NotificationManager:
|
||||
# Same as Proxmox's notification policy. The JournalWatcher already
|
||||
# gates these through SMART verification + its own 24h dedup, but
|
||||
# this acts as defense-in-depth in case a disk event arrives from
|
||||
# another source (PollingCollector, hooks, etc.).
|
||||
# another source (PollingCollector, hooks, health monitor, etc.).
|
||||
_DISK_EVENTS = {'disk_io_error', 'storage_unavailable'}
|
||||
if event.event_type in _DISK_EVENTS and cooldown_str is None:
|
||||
cooldown = 86400 # 24 hours
|
||||
|
||||
# Health monitor state_change events: per-category cooldowns.
|
||||
# Different health categories need different re-notification intervals.
|
||||
# This is the defense-in-depth layer matching HealthEventWatcher's
|
||||
# _CATEGORY_COOLDOWNS to prevent semi-cascades across all categories.
|
||||
_HEALTH_CATEGORY_COOLDOWNS = {
|
||||
'disks': 86400, 'smart': 86400, 'zfs': 86400, # 24h
|
||||
'storage': 3600, 'temperature': 3600, 'logs': 3600,
|
||||
'security': 3600, 'disk': 3600, # 1h
|
||||
'network': 1800, 'pve_services': 1800,
|
||||
'vms': 1800, 'cpu': 1800, 'memory': 1800, # 30m
|
||||
'updates': 86400, # 24h
|
||||
}
|
||||
if event.event_type == 'state_change' and event.source == 'health':
|
||||
cat = (event.data or {}).get('category', '')
|
||||
cat_cd = _HEALTH_CATEGORY_COOLDOWNS.get(cat)
|
||||
if cat_cd and cooldown_str is None:
|
||||
cooldown = max(cooldown, cat_cd)
|
||||
|
||||
# Backup/replication events: each execution is unique and should
|
||||
# always be delivered. A 10s cooldown prevents exact duplicates
|
||||
# (webhook + tasks) but allows repeated backup jobs to report.
|
||||
|
||||
Reference in New Issue
Block a user