mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 10:02:16 +00:00
Update notification service
This commit is contained in:
@@ -290,7 +290,7 @@ export function StorageOverview() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const obsTypeLabel = (t: string) =>
|
const obsTypeLabel = (t: string) =>
|
||||||
({ smart_error: 'SMART Error', io_error: 'I/O Error', connection_error: 'Connection Error' }[t] || t)
|
({ smart_error: 'SMART Error', io_error: 'I/O Error', filesystem_error: 'Filesystem Error', zfs_pool_error: 'ZFS Pool Error', connection_error: 'Connection Error' }[t] || t)
|
||||||
|
|
||||||
const getStorageTypeBadge = (type: string) => {
|
const getStorageTypeBadge = (type: string) => {
|
||||||
const typeColors: Record<string, string> = {
|
const typeColors: Record<string, string> = {
|
||||||
|
|||||||
@@ -967,15 +967,96 @@ class HealthMonitor:
|
|||||||
for pool_name, pool_info in zfs_pool_issues.items():
|
for pool_name, pool_info in zfs_pool_issues.items():
|
||||||
issues.append(f'{pool_name}: {pool_info["reason"]}')
|
issues.append(f'{pool_name}: {pool_info["reason"]}')
|
||||||
storage_details[pool_name] = pool_info
|
storage_details[pool_name] = pool_info
|
||||||
|
|
||||||
|
# Record error for notification system
|
||||||
|
real_pool = pool_info.get('pool_name', pool_name)
|
||||||
|
zfs_error_key = f'zfs_pool_{real_pool}'
|
||||||
|
zfs_reason = f'ZFS pool {real_pool}: {pool_info["reason"]}'
|
||||||
|
try:
|
||||||
|
if not health_persistence.is_error_active(zfs_error_key, category='zfs'):
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key=zfs_error_key,
|
||||||
|
category='zfs',
|
||||||
|
severity=pool_info.get('status', 'WARNING'),
|
||||||
|
reason=zfs_reason,
|
||||||
|
details={
|
||||||
|
'pool_name': real_pool,
|
||||||
|
'health': pool_info.get('health', ''),
|
||||||
|
'device': f'zpool:{real_pool}',
|
||||||
|
'dismissable': False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Record as permanent disk observation
|
||||||
|
try:
|
||||||
|
health_persistence.record_disk_observation(
|
||||||
|
device_name=f'zpool_{real_pool}',
|
||||||
|
serial=None,
|
||||||
|
error_type='zfs_pool_error',
|
||||||
|
error_signature=f'zfs_{real_pool}_{pool_info.get("health", "unknown")}',
|
||||||
|
raw_message=zfs_reason,
|
||||||
|
severity=pool_info.get('status', 'WARNING').lower(),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# ZFS pools are healthy -- clear any previously recorded ZFS errors
|
||||||
|
if self.capabilities.get('has_zfs'):
|
||||||
|
try:
|
||||||
|
active_errors = health_persistence.get_active_errors()
|
||||||
|
for error in active_errors:
|
||||||
|
if error.get('error_key', '').startswith('zfs_pool_'):
|
||||||
|
health_persistence.clear_error(error['error_key'])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Check disk health from Proxmox task log or system logs (SMART, etc.)
|
# Check disk health from Proxmox task log or system logs (SMART, etc.)
|
||||||
disk_health_issues = self._check_disk_health_from_events()
|
disk_health_issues = self._check_disk_health_from_events()
|
||||||
|
smart_warnings_found = False
|
||||||
if disk_health_issues:
|
if disk_health_issues:
|
||||||
for disk, issue in disk_health_issues.items():
|
for disk, issue in disk_health_issues.items():
|
||||||
# Only add if not already covered by critical mountpoint issues
|
# Only add if not already covered by critical mountpoint issues
|
||||||
if disk not in storage_details or storage_details[disk].get('status') == 'OK':
|
if disk not in storage_details or storage_details[disk].get('status') == 'OK':
|
||||||
issues.append(f'{disk}: {issue["reason"]}')
|
issues.append(f'{disk}: {issue["reason"]}')
|
||||||
storage_details[disk] = issue
|
storage_details[disk] = issue
|
||||||
|
|
||||||
|
# Track if any SMART warnings were found (for smart_health sub-check)
|
||||||
|
if issue.get('smart_lines'):
|
||||||
|
smart_warnings_found = True
|
||||||
|
|
||||||
|
# Record error with full details for notification system
|
||||||
|
# Avoid duplicate: if dmesg I/O errors already cover this disk
|
||||||
|
# (disk_{device}), skip the journal SMART notification to prevent
|
||||||
|
# the user getting two alerts for the same underlying problem.
|
||||||
|
device = issue.get('device', disk.replace('/dev/', ''))
|
||||||
|
io_error_key = f'disk_{device}'
|
||||||
|
error_key = f'smart_{device}'
|
||||||
|
reason = f'{disk}: {issue["reason"]}'
|
||||||
|
try:
|
||||||
|
if (not health_persistence.is_error_active(io_error_key, category='disks') and
|
||||||
|
not health_persistence.is_error_active(error_key, category='disks')):
|
||||||
|
health_persistence.record_error(
|
||||||
|
error_key=error_key,
|
||||||
|
category='disks',
|
||||||
|
severity=issue.get('status', 'WARNING'),
|
||||||
|
reason=reason,
|
||||||
|
details={
|
||||||
|
'disk': device,
|
||||||
|
'device': disk,
|
||||||
|
'block_device': device,
|
||||||
|
'serial': '',
|
||||||
|
'smart_status': 'WARNING',
|
||||||
|
'smart_lines': issue.get('smart_lines', []),
|
||||||
|
'io_lines': issue.get('io_lines', []),
|
||||||
|
'sample': issue.get('sample', ''),
|
||||||
|
'source': 'journal',
|
||||||
|
'dismissable': True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Check LVM status
|
# Check LVM status
|
||||||
lvm_status = self._check_lvm()
|
lvm_status = self._check_lvm()
|
||||||
@@ -1014,7 +1095,16 @@ class HealthMonitor:
|
|||||||
if not has_io:
|
if not has_io:
|
||||||
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
|
checks['io_errors'] = {'status': 'OK', 'detail': 'No I/O errors in dmesg'}
|
||||||
if self.capabilities.get('has_smart') and 'smart_health' not in checks:
|
if self.capabilities.get('has_smart') and 'smart_health' not in checks:
|
||||||
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
|
if smart_warnings_found:
|
||||||
|
# Collect the actual warning details for the sub-check
|
||||||
|
smart_details_parts = []
|
||||||
|
for disk_path, issue in disk_health_issues.items():
|
||||||
|
for sl in (issue.get('smart_lines') or [])[:3]:
|
||||||
|
smart_details_parts.append(sl)
|
||||||
|
detail_text = '; '.join(smart_details_parts[:3]) if smart_details_parts else 'SMART warning in journal'
|
||||||
|
checks['smart_health'] = {'status': 'WARNING', 'detail': detail_text}
|
||||||
|
else:
|
||||||
|
checks['smart_health'] = {'status': 'OK', 'detail': 'No SMART warnings in journal'}
|
||||||
if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
|
if self.capabilities.get('has_zfs') and 'zfs_pools' not in checks:
|
||||||
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
|
checks['zfs_pools'] = {'status': 'OK', 'detail': 'ZFS pools healthy'}
|
||||||
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
|
if self.capabilities.get('has_lvm') and 'lvm_volumes' not in checks and 'lvm_check' not in checks:
|
||||||
@@ -2743,6 +2833,7 @@ class HealthMonitor:
|
|||||||
details={
|
details={
|
||||||
'disk': base_device,
|
'disk': base_device,
|
||||||
'device': f'/dev/{fs_device}',
|
'device': f'/dev/{fs_device}',
|
||||||
|
'block_device': base_device,
|
||||||
'error_type': 'filesystem',
|
'error_type': 'filesystem',
|
||||||
'error_count': 1,
|
'error_count': 1,
|
||||||
'sample': line[:200],
|
'sample': line[:200],
|
||||||
@@ -2751,6 +2842,31 @@ class HealthMonitor:
|
|||||||
'device_exists': device_exists,
|
'device_exists': device_exists,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Record filesystem error as permanent disk observation
|
||||||
|
try:
|
||||||
|
obs_serial = None
|
||||||
|
try:
|
||||||
|
sm = subprocess.run(
|
||||||
|
['smartctl', '-i', f'/dev/{base_device}'],
|
||||||
|
capture_output=True, text=True, timeout=3)
|
||||||
|
if sm.returncode in (0, 4):
|
||||||
|
for sline in sm.stdout.split('\n'):
|
||||||
|
if 'Serial Number' in sline or 'Serial number' in sline:
|
||||||
|
obs_serial = sline.split(':')[-1].strip()
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
health_persistence.record_disk_observation(
|
||||||
|
device_name=base_device,
|
||||||
|
serial=obs_serial,
|
||||||
|
error_type='filesystem_error',
|
||||||
|
error_signature=f'fs_error_{fs_device}_{pattern_key}',
|
||||||
|
raw_message=enriched_reason[:500],
|
||||||
|
severity=fs_severity.lower(),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
recent_patterns[pattern] += 1
|
recent_patterns[pattern] += 1
|
||||||
|
|
||||||
@@ -3654,50 +3770,195 @@ class HealthMonitor:
|
|||||||
def _check_disk_health_from_events(self) -> Dict[str, Any]:
|
def _check_disk_health_from_events(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check for disk health warnings/errors from system logs (journalctl).
|
Check for disk health warnings/errors from system logs (journalctl).
|
||||||
Looks for SMART warnings and specific disk errors.
|
Looks for SMART warnings, smartd messages, and specific disk errors.
|
||||||
Returns dict of disk issues found.
|
|
||||||
|
Returns dict keyed by '/dev/sdX' with detailed issue info including
|
||||||
|
the actual log lines that triggered the warning, so notifications
|
||||||
|
and the health monitor show actionable information.
|
||||||
"""
|
"""
|
||||||
disk_issues = {}
|
disk_issues: Dict[str, Any] = {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Check journalctl for warnings/errors related to disks in the last hour
|
# Check journalctl for warnings/errors related to disks in the last hour
|
||||||
|
# Include smartd (SMART daemon) messages explicitly
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning'],
|
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
|
||||||
|
'--output=short-precise'],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=3
|
timeout=5
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode != 0:
|
||||||
for line in result.stdout.split('\n'):
|
return disk_issues
|
||||||
line_lower = line.lower()
|
|
||||||
|
# Collect all relevant lines per disk
|
||||||
|
# disk_lines[disk_name] = {'smart_lines': [], 'io_lines': [], 'severity': 'WARNING'}
|
||||||
|
disk_lines: Dict[str, Dict] = {}
|
||||||
|
|
||||||
|
for line in result.stdout.split('\n'):
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
|
line_lower = line.lower()
|
||||||
|
|
||||||
|
# Extract disk name -- multiple patterns for different log formats:
|
||||||
|
# /dev/sdh, /dev/nvme0n1
|
||||||
|
# Device: /dev/sdh [SAT] (smartd format)
|
||||||
|
# smartd[1234]: Device: /dev/sdh ...
|
||||||
|
disk_match = re.search(
|
||||||
|
r'(?:/dev/|Device:?\s*/dev/)(sd[a-z]+|nvme\d+n\d+|hd[a-z]+)',
|
||||||
|
line)
|
||||||
|
if not disk_match:
|
||||||
|
# Fallback for smartd messages that reference disk names differently
|
||||||
|
if 'smartd' in line_lower or 'smart' in line_lower:
|
||||||
|
disk_match = re.search(r'\b(sd[a-z]+|nvme\d+n\d+)\b', line)
|
||||||
|
if not disk_match:
|
||||||
|
continue
|
||||||
|
disk_name = disk_match.group(1)
|
||||||
|
|
||||||
|
if disk_name not in disk_lines:
|
||||||
|
disk_lines[disk_name] = {
|
||||||
|
'smart_lines': [], 'io_lines': [],
|
||||||
|
'severity': 'WARNING'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Classify the log line
|
||||||
|
# SMART warnings: smartd messages, SMART attribute warnings, etc.
|
||||||
|
if ('smart' in line_lower and
|
||||||
|
any(kw in line_lower for kw in
|
||||||
|
['warning', 'error', 'fail', 'exceeded', 'threshold',
|
||||||
|
'reallocat', 'pending', 'uncorrect', 'crc', 'offline',
|
||||||
|
'temperature', 'current_pending', 'reported_uncorrect'])):
|
||||||
|
# Extract the meaningful part of the log line (after hostname)
|
||||||
|
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
|
||||||
|
disk_lines[disk_name]['smart_lines'].append(msg_part.strip())
|
||||||
|
|
||||||
|
# smartd daemon messages (e.g. "smartd[1234]: Device: /dev/sdh ...")
|
||||||
|
elif 'smartd' in line_lower:
|
||||||
|
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
|
||||||
|
disk_lines[disk_name]['smart_lines'].append(msg_part.strip())
|
||||||
|
|
||||||
|
# Disk I/O / medium errors
|
||||||
|
elif any(kw in line_lower for kw in
|
||||||
|
['disk error', 'ata error', 'medium error', 'io error',
|
||||||
|
'i/o error', 'blk_update_request', 'sense key']):
|
||||||
|
msg_part = line.split(': ', 2)[-1] if ': ' in line else line
|
||||||
|
disk_lines[disk_name]['io_lines'].append(msg_part.strip())
|
||||||
|
disk_lines[disk_name]['severity'] = 'CRITICAL'
|
||||||
|
|
||||||
|
# Build issues with detailed reasons
|
||||||
|
for disk_name, info in disk_lines.items():
|
||||||
|
dev_path = f'/dev/{disk_name}'
|
||||||
|
smart_lines = info['smart_lines']
|
||||||
|
io_lines = info['io_lines']
|
||||||
|
severity = info['severity']
|
||||||
|
|
||||||
|
if not smart_lines and not io_lines:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Build a descriptive reason from the actual log entries
|
||||||
|
# Deduplicate similar messages (keep unique ones)
|
||||||
|
seen_msgs = set()
|
||||||
|
unique_smart = []
|
||||||
|
for msg in smart_lines:
|
||||||
|
# Normalize for dedup: strip timestamps and volatile parts
|
||||||
|
norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
|
||||||
|
if norm not in seen_msgs:
|
||||||
|
seen_msgs.add(norm)
|
||||||
|
unique_smart.append(msg)
|
||||||
|
|
||||||
|
unique_io = []
|
||||||
|
for msg in io_lines:
|
||||||
|
norm = re.sub(r'\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}', '', msg).strip()
|
||||||
|
if norm not in seen_msgs:
|
||||||
|
seen_msgs.add(norm)
|
||||||
|
unique_io.append(msg)
|
||||||
|
|
||||||
|
# Compose the reason with actual details
|
||||||
|
parts = []
|
||||||
|
if unique_smart:
|
||||||
|
if len(unique_smart) == 1:
|
||||||
|
parts.append(unique_smart[0])
|
||||||
|
else:
|
||||||
|
parts.append(f'{len(unique_smart)} SMART warnings')
|
||||||
|
# Include the first 3 most relevant entries
|
||||||
|
for entry in unique_smart[:3]:
|
||||||
|
parts.append(f' - {entry}')
|
||||||
|
|
||||||
|
if unique_io:
|
||||||
|
if len(unique_io) == 1:
|
||||||
|
parts.append(unique_io[0])
|
||||||
|
else:
|
||||||
|
parts.append(f'{len(unique_io)} I/O errors')
|
||||||
|
for entry in unique_io[:3]:
|
||||||
|
parts.append(f' - {entry}')
|
||||||
|
|
||||||
|
reason = '\n'.join(parts) if parts else 'SMART/disk warning in system logs'
|
||||||
|
|
||||||
|
# Keep first sample line for observation recording
|
||||||
|
sample_line = (unique_smart[0] if unique_smart else
|
||||||
|
unique_io[0] if unique_io else '')
|
||||||
|
|
||||||
|
disk_issues[dev_path] = {
|
||||||
|
'status': severity,
|
||||||
|
'reason': reason,
|
||||||
|
'device': disk_name,
|
||||||
|
'smart_lines': unique_smart[:5],
|
||||||
|
'io_lines': unique_io[:5],
|
||||||
|
'sample': sample_line,
|
||||||
|
'source': 'journal',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Record as disk observation for the permanent history
|
||||||
|
try:
|
||||||
|
obs_type = 'smart_error' if unique_smart else 'io_error'
|
||||||
|
# Build a stable signature from the error family, not the volatile details
|
||||||
|
if unique_smart:
|
||||||
|
sig_base = 'smart_journal'
|
||||||
|
# Classify SMART warnings by type
|
||||||
|
all_text = ' '.join(unique_smart).lower()
|
||||||
|
if any(kw in all_text for kw in ['reallocat', 'pending', 'uncorrect']):
|
||||||
|
sig_base = 'smart_sector_issues'
|
||||||
|
elif 'temperature' in all_text:
|
||||||
|
sig_base = 'smart_temperature'
|
||||||
|
elif 'crc' in all_text or 'udma' in all_text:
|
||||||
|
sig_base = 'smart_crc_errors'
|
||||||
|
elif 'fail' in all_text:
|
||||||
|
sig_base = 'smart_test_failed'
|
||||||
|
else:
|
||||||
|
sig_base = 'journal_io_error'
|
||||||
|
|
||||||
# Check for SMART warnings/errors
|
obs_sig = f'{sig_base}_{disk_name}'
|
||||||
if 'smart' in line_lower and ('warning' in line_lower or 'error' in line_lower or 'fail' in line_lower):
|
|
||||||
# Extract disk name using regex for common disk identifiers
|
|
||||||
disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line)
|
|
||||||
if disk_match:
|
|
||||||
disk_name = disk_match.group(1)
|
|
||||||
# Prioritize CRITICAL if already warned, otherwise set to WARNING
|
|
||||||
if disk_name not in disk_issues or disk_issues[f'/dev/{disk_name}']['status'] != 'CRITICAL':
|
|
||||||
disk_issues[f'/dev/{disk_name}'] = {
|
|
||||||
'status': 'WARNING',
|
|
||||||
'reason': 'SMART warning detected'
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check for specific disk I/O or medium errors
|
# Try to get serial for proper cross-referencing
|
||||||
if any(keyword in line_lower for keyword in ['disk error', 'ata error', 'medium error', 'io error']):
|
obs_serial = None
|
||||||
disk_match = re.search(r'/dev/(sd[a-z]|nvme\d+n\d+|hd\d+)', line)
|
try:
|
||||||
if disk_match:
|
sm = subprocess.run(
|
||||||
disk_name = disk_match.group(1)
|
['smartctl', '-i', dev_path],
|
||||||
disk_issues[f'/dev/{disk_name}'] = {
|
capture_output=True, text=True, timeout=3)
|
||||||
'status': 'CRITICAL',
|
if sm.returncode in (0, 4):
|
||||||
'reason': 'Disk error detected'
|
for sline in sm.stdout.split('\n'):
|
||||||
}
|
if 'Serial Number' in sline or 'Serial number' in sline:
|
||||||
|
obs_serial = sline.split(':')[-1].strip()
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
health_persistence.record_disk_observation(
|
||||||
|
device_name=disk_name,
|
||||||
|
serial=obs_serial,
|
||||||
|
error_type=obs_type,
|
||||||
|
error_signature=obs_sig,
|
||||||
|
raw_message=f'/dev/{disk_name}: {reason}',
|
||||||
|
severity=severity.lower(),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
print("[HealthMonitor] journalctl timed out in _check_disk_health_from_events")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[HealthMonitor] Error checking disk health from events: {e}")
|
print(f"[HealthMonitor] Error checking disk health from events: {e}")
|
||||||
# Return empty dict on error, as this check isn't system-critical itself
|
|
||||||
pass
|
|
||||||
|
|
||||||
return disk_issues
|
return disk_issues
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user