mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-14 08:02:16 +00:00
update notification_events.py
This commit is contained in:
@@ -739,6 +739,14 @@ class HealthPersistence:
|
||||
}
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
# ── Clear cooldowns for newly dismissed errors too ──
|
||||
if sup_hours != -1:
|
||||
if category == 'disks':
|
||||
self._clear_disk_io_cooldown(error_key)
|
||||
else:
|
||||
self._clear_notification_cooldown(error_key)
|
||||
|
||||
return result
|
||||
|
||||
if row:
|
||||
@@ -803,6 +811,19 @@ class HealthPersistence:
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
# ── Coordinate with notification cooldowns ──
|
||||
# When an error is dismissed with non-permanent suppression,
|
||||
# clear the corresponding cooldown in notification_last_sent
|
||||
# so it can re-notify after the suppression period expires.
|
||||
# This applies to ALL categories, not just disks.
|
||||
if sup_hours != -1:
|
||||
if category == 'disks':
|
||||
self._clear_disk_io_cooldown(error_key)
|
||||
else:
|
||||
# For non-disk categories, clear the PollingCollector cooldown
|
||||
self._clear_notification_cooldown(error_key)
|
||||
|
||||
return result
|
||||
|
||||
def is_error_acknowledged(self, error_key: str) -> bool:
|
||||
@@ -2732,5 +2753,123 @@ class HealthPersistence:
|
||||
return set()
|
||||
|
||||
|
||||
def _clear_notification_cooldown(self, error_key: str):
    """
    Clear notification cooldown from notification_last_sent for non-disk errors.

    Removes the 'health_'-prefixed fingerprint for ``error_key`` (the prefix
    PollingCollector uses) plus any other fingerprint that contains
    ``error_key`` as a literal substring, so a dismissed error can be
    re-notified once its suppression period expires.

    Best-effort: database errors are logged and never propagated.

    Args:
        error_key: Health error key whose cooldown rows should be removed.
            An empty key is ignored.
    """
    if not error_key:
        # An empty key would turn the substring match into LIKE '%%' and
        # wipe every cooldown row in the table.
        return

    # '%' and '_' are LIKE wildcards and error keys routinely contain '_',
    # so escape them to keep the substring match literal.
    escaped_key = (
        error_key.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    )

    try:
        conn = self._get_conn()
        try:
            cursor = conn.cursor()

            # PollingCollector uses 'health_' prefix
            cursor.execute(
                'DELETE FROM notification_last_sent WHERE fingerprint = ?',
                (f'health_{error_key}',)
            )
            # rowcount only describes the most recent statement, so the
            # two deletes have to be added up explicitly.
            deleted_count = max(cursor.rowcount, 0)

            # Also delete any fingerprints that contain the error_key
            cursor.execute(
                "DELETE FROM notification_last_sent "
                "WHERE fingerprint LIKE ? ESCAPE '\\'",
                (f'%{escaped_key}%',)
            )
            deleted_count += max(cursor.rowcount, 0)

            conn.commit()
        finally:
            # Release the connection even when a statement fails.
            conn.close()

        if deleted_count > 0:
            print(f"[HealthPersistence] Cleared notification cooldowns for {error_key}")
    except Exception as e:
        print(f"[HealthPersistence] Error clearing notification cooldown: {e}")
|
||||
|
||||
def _clear_disk_io_cooldown(self, error_key: str):
    """
    Clear disk I/O cooldowns from notification_last_sent when an error is dismissed.

    This coordinates with BOTH:
    1. JournalWatcher's 24h cooldown system (prefixes: diskio_, fs_, fs_serial_)
    2. PollingCollector's 24h cooldown system (prefix: health_)

    The device name is extracted from ``error_key``; every fingerprint that
    starts with one of the known prefixes for that device, or that contains
    the device name (or its whole-disk name) anywhere, is deleted so the
    error can be re-detected and re-notified after the suppression period.

    Matches fingerprints like:
    - diskio_sdh, diskio_sda, diskio_nvme0n1
    - fs_sdh1, fs_sda2, fs_serial_XXXXX
    - health_disk_smart_sdh, health_disk_io_error_sdh
    - sdh (direct device name used by JournalWatcher)

    Best-effort: if no device name can be extracted nothing is deleted, and
    database errors are logged and never propagated.

    Args:
        error_key: Disk error key, e.g. ``disk_fs_sdh``, ``disk_smart_sda``,
            ``disk_io_error_sdh``, ``smart_sdh`` or a bare device name.
    """
    try:
        # Extract device name from error_key
        # Common patterns: disk_fs_sdh, disk_smart_sda, disk_io_error_sdh, smart_sdh
        import re

        device_match = re.search(r'(?:disk_fs_|disk_smart_|disk_io_error_|disk_|smart_|io_error_)(?:/dev/)?([a-z]{2,4}[a-z0-9]*)', error_key)
        if not device_match:
            # error_key might just be the device name
            device_match = re.match(r'^([a-z]{2,4}[a-z0-9]*)$', error_key)
            if not device_match:
                return

        device = device_match.group(1)

        # Derive the whole-disk name. NVMe/MMC disk names themselves end in
        # a digit ("nvme0n1", "mmcblk0") and mark partitions with "p<N>", so
        # blindly stripping trailing digits would yield "nvme0n" and make the
        # substring cleanup below hit sibling devices such as nvme0n2.
        ns_match = re.match(r'^(nvme\d+n\d+|mmcblk\d+)(?:p\d+)?$', device)
        if ns_match:
            base_device = ns_match.group(1)          # nvme0n1p2 -> nvme0n1
        else:
            base_device = re.sub(r'\d+$', '', device)  # sdh1 -> sdh

        def _like_literal(text):
            # '%' and '_' are LIKE wildcards; escape them so prefixes such as
            # 'diskio_' match literally.
            return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

        # Build patterns to match in notification_last_sent
        # JournalWatcher uses: direct device name, diskio_, fs_, fs_serial_
        # PollingCollector uses: health_ prefix
        # dict.fromkeys drops duplicates (device == base_device) but keeps order.
        patterns = list(dict.fromkeys([
            # JournalWatcher patterns
            device,  # Direct device name (JournalWatcher._check_disk_io uses this)
            base_device,
            f'diskio_{device}',
            f'diskio_{base_device}',
            f'fs_{device}',
            f'fs_{base_device}',
            # PollingCollector patterns (uses health_ prefix)
            f'health_{error_key}',
            f'health_disk_smart_{device}',
            f'health_disk_smart_{base_device}',
            f'health_disk_io_error_{device}',
            f'health_disk_io_error_{base_device}',
            f'health_disk_fs_{device}',
            f'health_disk_fs_{base_device}',
        ]))

        like_sql = (
            "DELETE FROM notification_last_sent "
            "WHERE fingerprint LIKE ? ESCAPE '\\'"
        )

        conn = self._get_conn()
        try:
            cursor = conn.cursor()

            # Prefix match: covers the exact fingerprint as well as longer
            # serial-based keys that start with it.
            for pattern in patterns:
                cursor.execute(like_sql, (f'{_like_literal(pattern)}%',))

            # Also clear fingerprints that contain the device name anywhere.
            # This catches edge cases like different fingerprint formats.
            for name in dict.fromkeys((device, base_device)):
                cursor.execute(like_sql, (f'%{_like_literal(name)}%',))

            conn.commit()
        finally:
            # Release the connection even when a statement fails.
            conn.close()

        print(f"[HealthPersistence] Cleared disk I/O cooldowns for {error_key} (device: {device})")
    except Exception as e:
        print(f"[HealthPersistence] Error clearing disk I/O cooldown: {e}")
|
||||
|
||||
|
||||
# Global instance
|
||||
# Shared HealthPersistence instance; a JournalWatcher docstring refers to it
# by this name (health_persistence._clear_disk_io_cooldown()).
health_persistence = HealthPersistence()
|
||||
|
||||
@@ -319,6 +319,41 @@ class JournalWatcher:
|
||||
except Exception as e:
|
||||
print(f"[JournalWatcher] Failed to save disk_io_notified: {e}")
|
||||
|
||||
def _get_disk_io_cooldown_from_db(self, device: str) -> Optional[float]:
    """
    Get disk I/O cooldown timestamp from DB for a device.

    Used to re-check DB when user might have dismissed the error,
    which clears the DB entry via health_persistence._clear_disk_io_cooldown().

    Args:
        device: Fingerprint to look up in notification_last_sent. It is
            matched exactly as passed; no prefix is added here.

    Returns:
        The stored timestamp if a row exists and is younger than
        ``self._DISK_IO_COOLDOWN`` seconds; None if the DB file is missing,
        no row exists, the row is older than the cooldown, or the lookup
        fails.
    """
    try:
        db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
        if not db_path.exists():
            return None

        conn = sqlite3.connect(str(db_path), timeout=5)
        try:
            conn.execute('PRAGMA busy_timeout=3000')
            cursor = conn.cursor()
            cursor.execute(
                "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
                (device,)
            )
            row = cursor.fetchone()
        finally:
            # Close the connection even if the PRAGMA or the query raised.
            conn.close()

        if row is None or row[0] is None:
            return None

        ts = float(row[0])
        # Only return if within the cooldown window
        if time.time() - ts < self._DISK_IO_COOLDOWN:
            return ts
        return None
    except Exception as e:
        # Callers read None as "no active cooldown", so make a failed lookup
        # visible instead of hiding it.
        print(f"[JournalWatcher] Failed to read disk I/O cooldown for {device}: {e}")
        return None
|
||||
|
||||
def stop(self):
|
||||
"""Stop the journal watcher."""
|
||||
self._running = False
|
||||
@@ -589,7 +624,14 @@ class JournalWatcher:
|
||||
fs_dedup_key = f'fs_{device}'
|
||||
last_fs_notified = self._disk_io_notified.get(fs_dedup_key, 0)
|
||||
if now_fs - last_fs_notified < self._DISK_IO_COOLDOWN:
|
||||
return # Already notified for this device recently
|
||||
# In-memory says cooldown active. Re-check DB in case
|
||||
# user dismissed the error (which clears DB cooldowns).
|
||||
db_ts = self._get_disk_io_cooldown_from_db(fs_dedup_key)
|
||||
if db_ts is not None and now_fs - db_ts < self._DISK_IO_COOLDOWN:
|
||||
return # DB confirms cooldown is still active
|
||||
# DB says cooldown was cleared - proceed
|
||||
if fs_dedup_key in self._disk_io_notified:
|
||||
del self._disk_io_notified[fs_dedup_key]
|
||||
|
||||
# ── Device existence gating ──
|
||||
device_exists = base_dev and _os.path.exists(f'/dev/{base_dev}')
|
||||
@@ -842,10 +884,24 @@ class JournalWatcher:
|
||||
return
|
||||
|
||||
# ── Gate 2: 24-hour dedup per device ──
|
||||
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
|
||||
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
|
||||
# entry, so we should refresh from DB to get the real state.
|
||||
now = time.time()
|
||||
|
||||
# First check in-memory cache
|
||||
last_notified = self._disk_io_notified.get(resolved, 0)
|
||||
|
||||
if now - last_notified < self._DISK_IO_COOLDOWN:
|
||||
return # Already notified for this disk recently
|
||||
# In-memory says we already notified. But user might have dismissed
|
||||
# the error, which clears the DB. Re-check DB to be sure.
|
||||
db_ts = self._get_disk_io_cooldown_from_db(resolved)
|
||||
if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
|
||||
return # DB confirms cooldown is still active
|
||||
# DB says cooldown was cleared (user dismissed) - proceed to notify
|
||||
# Update in-memory cache
|
||||
del self._disk_io_notified[resolved]
|
||||
|
||||
self._disk_io_notified[resolved] = now
|
||||
self._save_disk_io_notified(resolved, now)
|
||||
|
||||
@@ -2069,8 +2125,16 @@ class PollingCollector:
|
||||
# ── SAME ERROR COOLDOWN (24h) ──
|
||||
# The SAME error_key cannot be re-notified before 24 hours.
|
||||
# This is the PRIMARY deduplication mechanism.
|
||||
# EXCEPTION: If user dismissed the error, the cooldown is cleared in DB
|
||||
# and we should re-check DB to see if cooldown still applies.
|
||||
if time_since_last < self.SAME_ERROR_COOLDOWN:
|
||||
continue
|
||||
# Check if user dismissed this - clears DB cooldown
|
||||
db_ts = self._get_cooldown_from_db(error_key)
|
||||
if db_ts is not None and now - db_ts < self.SAME_ERROR_COOLDOWN:
|
||||
continue # DB confirms cooldown still active
|
||||
# DB says cooldown was cleared (user dismissed) - remove from memory
|
||||
self._last_notified.pop(error_key, None)
|
||||
# Continue to the next checks (category cooldown etc.)
|
||||
|
||||
# ── CATEGORY COOLDOWN (varies) ──
|
||||
# DIFFERENT errors within the same category respect category cooldown.
|
||||
@@ -2735,6 +2799,41 @@ class PollingCollector:
|
||||
conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _get_cooldown_from_db(self, error_key: str) -> Optional[float]:
    """
    Get cooldown timestamp from DB for an error_key.

    Used to re-check DB when user might have dismissed the error. Dismissal
    removes the 'health_'-prefixed fingerprint via
    health_persistence._clear_notification_cooldown() or, for disk errors,
    health_persistence._clear_disk_io_cooldown().

    Args:
        error_key: Health error key; looked up as ``health_<error_key>``.

    Returns:
        The stored timestamp if a row exists and is younger than
        ``self.SAME_ERROR_COOLDOWN`` seconds; None if the DB file is missing,
        no row exists, the row is older than the cooldown, or the lookup
        fails.
    """
    try:
        db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
        if not db_path.exists():
            return None

        # PollingCollector uses 'health_' prefix for its fingerprints
        fp = f'health_{error_key}'

        conn = sqlite3.connect(str(db_path), timeout=5)
        try:
            conn.execute('PRAGMA busy_timeout=3000')
            cursor = conn.cursor()
            cursor.execute(
                "SELECT last_sent_ts FROM notification_last_sent WHERE fingerprint = ?",
                (fp,)
            )
            row = cursor.fetchone()
        finally:
            # Close the connection even if the PRAGMA or the query raised.
            conn.close()

        if row is None or row[0] is None:
            return None

        ts = float(row[0])
        # Only return if within the cooldown window
        if time.time() - ts < self.SAME_ERROR_COOLDOWN:
            return ts
        return None
    except Exception as e:
        # Callers read None as "no active cooldown", so make a failed lookup
        # visible instead of hiding it.
        print(f"[PollingCollector] Failed to read cooldown for {error_key}: {e}")
        return None
|
||||
|
||||
|
||||
# ─── Proxmox Webhook Receiver ───────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user