Update notification service

This commit is contained in:
MacRimi
2026-03-05 19:25:05 +01:00
parent 898392725a
commit 5af51096d8
2 changed files with 294 additions and 33 deletions

View File

@@ -290,7 +290,7 @@ export function StorageOverview() {
}
const obsTypeLabel = (t: string) =>
({ smart_error: 'SMART Error', io_error: 'I/O Error', connection_error: 'Connection Error' }[t] || t)
({ smart_error: 'SMART Error', io_error: 'I/O Error', filesystem_error: 'Filesystem Error', zfs_pool_error: 'ZFS Pool Error', connection_error: 'Connection Error' }[t] || t)
const getStorageTypeBadge = (type: string) => {
const typeColors: Record<string, string> = {

View File

@@ -967,15 +967,96 @@ class HealthMonitor:
for pool_name, pool_info in zfs_pool_issues.items():
issues.append(f'{pool_name}: {pool_info["reason"]}')
storage_details[pool_name] = pool_info
# Record error for notification system
real_pool = pool_info.get('pool_name', pool_name)
zfs_error_key = f'zfs_pool_{real_pool}'
zfs_reason = f'ZFS pool {real_pool}: {pool_info["reason"]}'
try:
if not health_persistence.is_error_active(zfs_error_key, category='zfs'):
health_persistence.record_error(
error_key=zfs_error_key,
category='zfs',
severity=pool_info.get('status', 'WARNING'),
reason=zfs_reason,
details={
'pool_name': real_pool,
'health': pool_info.get('health', ''),
'device': f'zpool:{real_pool}',
'dismissable': False,
}
)
except Exception:
pass
# Record as permanent disk observation
try:
health_persistence.record_disk_observation(
device_name=f'zpool_{real_pool}',
serial=None,
error_type='zfs_pool_error',
error_signature=f'zfs_{real_pool}_{pool_info.get("health", "unknown")}',
raw_message=zfs_reason,
severity=pool_info.get('status', 'WARNING').lower(),
)
except Exception:
pass
else:
# ZFS pools are healthy -- clear any previously recorded ZFS errors
if self.capabilities.get('has_zfs'):
try:
active_errors = health_persistence.get_active_errors()
for error in active_errors:
if error.get('error_key', '').startswith('zfs_pool_'):
health_persistence.clear_error(error['error_key'])
except Exception:
pass
# Check disk health from Proxmox task log or system logs (SMART, etc.)
disk_health_issues = self._check_disk_health_from_events()
smart_warnings_found = False
if disk_health_issues:
for disk, issue in disk_health_issues.items():
# Only add if not already covered by critical mountpoint issues
if disk not in storage_details or storage_details[disk].get('status') == 'OK':
issues.append(f'{disk}: {issue["reason"]}')
storage_details[disk] = issue
# Track if any SMART warnings were found (for smart_health sub-check)
if issue.get('smart_lines'):
smart_warnings_found = True
# Record error with full details for notification system
# Avoid duplicate: if dmesg I/O errors already cover this disk
# (disk_{device}), skip the journal SMART notification to prevent
# the user getting two alerts for the same underlying problem.
device = issue.get('device', disk.replace('/dev/', ''))
io_error_key = f'disk_{device}'
error_key = f'smart_{device}'
reason = f'{disk}: {issue["reason"]}'
try:
if (not health_persistence.is_error_active(io_error_key, category='disks') and
not health_persistence.is_error_active(error_key, category='disks')):
health_persistence.record_error(
error_key=error_key,
category='disks',
severity=issue.get('status', 'WARNING'),
reason=reason,
details={
'disk': device,
'device': disk,
'block_device': device,
'serial': '',
'smart_status': 'WARNING',
'smart_lines': issue.get('smart_lines', []),
'io_lines': issue.get('io_lines', []),
'sample': issue.get('sample', ''),
'source': 'journal',
'dismissable': True,
}
)
except Exception:
pass
# Check LVM status
lvm_status = self._check_lvm()
@@ -1014,7 +1095,16 @@ class HealthMonitor:
if not has_io:
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
if self.capabilities.get('has_smart') and 'smart_health' not in checks:
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
if smart_warnings_found:
# Collect the actual warning details for the sub-check
smart_details_parts = []
for disk_path, issue in disk_health_issues.items():
for sl in (issue.get('smart_lines') or [])[:3]:
smart_details_parts.append(sl)
detail_text = '; '.join(smart_details_parts[:3]) if smart_details_parts else 'SMART warning in journal'
checks['smart_health'] = {'status': 'WARNING', 'detail': detail_text}
else:
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
@@ -2743,6 +2833,7 @@ class HealthMonitor:
details={
'disk': base_device,
'device': f'/dev/{fs_device}',
'block_device': base_device,
'error_type': 'filesystem',
'error_count': 1,
'sample': line[:200],
@@ -2751,6 +2842,31 @@ class HealthMonitor:
'device_exists': device_exists,
}
)
# Record filesystem error as permanent disk observation
try:
obs_serial = None
try:
sm = subprocess.run(
['smartctl', '-i', f'/dev/{base_device}'],
capture_output=True, text=True, timeout=3)
if sm.returncode in (0, 4):
for sline in sm.stdout.split('\n'):
if 'Serial Number' in sline or 'Serial number' in sline:
obs_serial = sline.split(':')[-1].strip()
break
except Exception:
pass
health_persistence.record_disk_observation(
device_name=base_device,
serial=obs_serial,
error_type='filesystem_error',
error_signature=f'fs_error_{fs_device}_{pattern_key}',
raw_message=enriched_reason[:500],
severity=fs_severity.lower(),
)
except Exception:
pass
recent_patterns[pattern] += 1
@@ -3654,50 +3770,195 @@ class HealthMonitor:
def _check_disk_health_from_events(self) -> Dict[str, Any]:
"""
Check for disk health warnings/errors from system logs (journalctl).
Looks for SMART warnings and specific disk errors.
Returns dict of disk issues found.
Looks for SMART warnings, smartd messages, and specific disk errors.
Returns dict keyed by '/dev/sdX' with detailed issue info including
the actual log lines that triggered the warning, so notifications
and the health monitor show actionable information.
"""
disk_issues = {}
disk_issues: Dict[str, Any] = {}
try:
# Check journalctl for warnings/errors related to disks in the last hour
# Include smartd (SMART daemon) messages explicitly
result = subprocess.run(
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning'],
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
'--output=short-precise'],
capture_output=True,
text=True,
timeout=3
timeout=5
)
if result.returncode == 0:
for line in result.stdout.split('\n'):
line_lower = line.lower()
if result.returncode != 0:
return disk_issues
# Collect all relevant lines per disk
# disk_lines[disk_name] = {'smart_lines': [], 'io_lines': [], 'severity': 'WARNING'}
disk_lines: Dict[str, Dict] = {}
for line in result.stdout.split('\n'):
if not line.strip():
continue
line_lower = line.lower()
# Extract disk name -- multiple patterns for different log formats:
# /dev/sdh, /dev/nvme0n1
# Device: /dev/sdh [SAT] (smartd format)
# smartd[1234]: Device: /dev/sdh ...
disk_match = re.search(
r'(?:/dev/|Device:?\s*/dev/)(sd[a-z]+|nvme\d+n\d+|hd[a-z]+)',
line)
if not disk_match:
# Fallback for smartd messages that reference disk names differently
if 'smartd' in line_lower or 'smart' in line_lower:
disk_match = re.search(r'\b(sd[a-z]+|nvme\d+n\d+)\b', line)
if not disk_match:
continue
disk_name = disk_match.group(1)
if disk_name not in disk_lines:
disk_lines[disk_name] = {
'smart_lines': [], 'io_lines': [],
'severity': 'WARNING'
}
# Classify the log line
# SMART warnings: smartd messages, SMART attribute warnings, etc.
if ('smart' in line_lower and
any(kw in line_lower for kw in
['warning', 'error', 'fail', 'exceeded', 'threshold',
'reallocat', 'pending', 'uncorrect', 'crc', 'offline',
'temperature', 'current_pending', 'reported_uncorrect'])):
# Extract the meaningful part of the log line (after hostname)
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
disk_lines[disk_name]['smart_lines'].append(msg_part.strip())
# smartd daemon messages (e.g. "smartd[1234]: Device: /dev/sdh ...")
elif 'smartd' in line_lower:
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
disk_lines[disk_name]['smart_lines'].append(msg_part.strip())
# Disk I/O / medium errors
elif any(kw in line_lower for kw in
['disk error', 'ata error', 'medium error', 'io error',
'i/o error', 'blk_update_request', 'sense key']):
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
disk_lines[disk_name]['io_lines'].append(msg_part.strip())
disk_lines[disk_name]['severity'] = 'CRITICAL'
# Build issues with detailed reasons
for disk_name, info in disk_lines.items():
dev_path = f'/dev/{disk_name}'
smart_lines = info['smart_lines']
io_lines = info['io_lines']
severity = info['severity']
if not smart_lines and not io_lines:
continue
# Build a descriptive reason from the actual log entries
# Deduplicate similar messages (keep unique ones)
seen_msgs = set()
unique_smart = []
for msg in smart_lines:
# Normalize for dedup: strip timestamps and volatile parts
norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
if norm not in seen_msgs:
seen_msgs.add(norm)
unique_smart.append(msg)
unique_io = []
for msg in io_lines:
norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
if norm not in seen_msgs:
seen_msgs.add(norm)
unique_io.append(msg)
# Compose the reason with actual details
parts = []
if unique_smart:
if len(unique_smart) == 1:
parts.append(unique_smart[0])
else:
parts.append(f'{len(unique_smart)} SMART warnings')
# Include the first 3 most relevant entries
for entry in unique_smart[:3]:
parts.append(f' - {entry}')
if unique_io:
if len(unique_io) == 1:
parts.append(unique_io[0])
else:
parts.append(f'{len(unique_io)} I/O errors')
for entry in unique_io[:3]:
parts.append(f' - {entry}')
reason = '\n'.join(parts) if parts else 'SMART/disk warning in system logs'
# Keep first sample line for observation recording
sample_line = (unique_smart[0] if unique_smart else
unique_io[0] if unique_io else '')
disk_issues[dev_path] = {
'status': severity,
'reason': reason,
'device': disk_name,
'smart_lines': unique_smart[:5],
'io_lines': unique_io[:5],
'sample': sample_line,
'source': 'journal',
}
# Record as disk observation for the permanent history
try:
obs_type = 'smart_error' if unique_smart else 'io_error'
# Build a stable signature from the error family, not the volatile details
if unique_smart:
sig_base = 'smart_journal'
# Classify SMART warnings by type
all_text = ' '.join(unique_smart).lower()
if any(kw in all_text for kw in ['reallocat', 'pending', 'uncorrect']):
sig_base = 'smart_sector_issues'
elif 'temperature' in all_text:
sig_base = 'smart_temperature'
elif 'crc' in all_text or 'udma' in all_text:
sig_base = 'smart_crc_errors'
elif 'fail' in all_text:
sig_base = 'smart_test_failed'
else:
sig_base = 'journal_io_error'
# Check for SMART warnings/errors
if 'smart' in line_lower and ('warning' in line_lower or 'error' in line_lower or 'fail' in line_lower):
# Extract disk name using regex for common disk identifiers
disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line)
if disk_match:
disk_name = disk_match.group(1)
# Prioritize CRITICAL if already warned, otherwise set to WARNING
if disk_name not in disk_issues or disk_issues[f'/dev/{disk_name}']['status'] != 'CRITICAL':
disk_issues[f'/dev/{disk_name}'] = {
'status': 'WARNING',
'reason': 'SMART warning detected'
}
obs_sig = f'{sig_base}_{disk_name}'
# Check for specific disk I/O or medium errors
if any(keyword in line_lower for keyword in ['disk error', 'ata error', 'medium error', 'io error']):
disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line)
if disk_match:
disk_name = disk_match.group(1)
disk_issues[f'/dev/{disk_name}'] = {
'status': 'CRITICAL',
'reason': 'Disk error detected'
}
# Try to get serial for proper cross-referencing
obs_serial = None
try:
sm = subprocess.run(
['smartctl', '-i', dev_path],
capture_output=True, text=True, timeout=3)
if sm.returncode in (0, 4):
for sline in sm.stdout.split('\n'):
if 'Serial Number' in sline or 'Serial number' in sline:
obs_serial = sline.split(':')[-1].strip()
break
except Exception:
pass
health_persistence.record_disk_observation(
device_name=disk_name,
serial=obs_serial,
error_type=obs_type,
error_signature=obs_sig,
raw_message=f'/dev/{disk_name}: {reason}',
severity=severity.lower(),
)
except Exception:
pass
except subprocess.TimeoutExpired:
print("[HealthMonitor] journalctl timed out in _check_disk_health_from_events")
except Exception as e:
print(f"[HealthMonitor] Error checking disk health from events: {e}")
# Return empty dict on error, as this check isn't system-critical itself
pass
return disk_issues