mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 10:02:16 +00:00
Update notification service
This commit is contained in:
@@ -1045,6 +1045,150 @@ _system_info_cache = {
|
|||||||
}
|
}
|
||||||
_SYSTEM_INFO_CACHE_TTL = 21600 # 6 hours - update notifications are sent once per 24h
|
_SYSTEM_INFO_CACHE_TTL = 21600 # 6 hours - update notifications are sent once per 24h
|
||||||
|
|
||||||
|
# Cache for pvesh cluster resources (reduces repeated API calls)
|
||||||
|
_pvesh_cache = {
|
||||||
|
'cluster_resources_vm': None,
|
||||||
|
'cluster_resources_vm_time': 0,
|
||||||
|
'cluster_resources_storage': None,
|
||||||
|
'cluster_resources_storage_time': 0,
|
||||||
|
'storage_list': None,
|
||||||
|
'storage_list_time': 0,
|
||||||
|
}
|
||||||
|
_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance
|
||||||
|
|
||||||
|
# Cache for sensors output (temperature readings)
|
||||||
|
_sensors_cache = {
|
||||||
|
'output': None,
|
||||||
|
'time': 0,
|
||||||
|
}
|
||||||
|
_SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly
|
||||||
|
|
||||||
|
# Cache for hardware info (lspci, dmidecode, lsblk)
|
||||||
|
_hardware_cache = {
|
||||||
|
'lspci': None,
|
||||||
|
'lspci_time': 0,
|
||||||
|
'dmidecode': None,
|
||||||
|
'dmidecode_time': 0,
|
||||||
|
'lsblk': None,
|
||||||
|
'lsblk_time': 0,
|
||||||
|
}
|
||||||
|
_HARDWARE_CACHE_TTL = 300 # 5 minutes - hardware doesn't change
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_pvesh_cluster_resources_vm():
|
||||||
|
"""Get cluster VM resources with 30s cache."""
|
||||||
|
global _pvesh_cache
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
if _pvesh_cache['cluster_resources_vm'] is not None and \
|
||||||
|
now - _pvesh_cache['cluster_resources_vm_time'] < _PVESH_CACHE_TTL:
|
||||||
|
return _pvesh_cache['cluster_resources_vm']
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
data = json.loads(result.stdout)
|
||||||
|
_pvesh_cache['cluster_resources_vm'] = data
|
||||||
|
_pvesh_cache['cluster_resources_vm_time'] = now
|
||||||
|
return data
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _pvesh_cache['cluster_resources_vm'] or []
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_sensors_output():
|
||||||
|
"""Get sensors output with 10s cache."""
|
||||||
|
global _sensors_cache
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
if _sensors_cache['output'] is not None and \
|
||||||
|
now - _sensors_cache['time'] < _SENSORS_CACHE_TTL:
|
||||||
|
return _sensors_cache['output']
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
|
||||||
|
if result.returncode == 0:
|
||||||
|
_sensors_cache['output'] = result.stdout
|
||||||
|
_sensors_cache['time'] = now
|
||||||
|
return result.stdout
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _sensors_cache['output'] or ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_lspci():
|
||||||
|
"""Get lspci output with 5 minute cache."""
|
||||||
|
global _hardware_cache
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
if _hardware_cache['lspci'] is not None and \
|
||||||
|
now - _hardware_cache['lspci_time'] < _HARDWARE_CACHE_TTL:
|
||||||
|
return _hardware_cache['lspci']
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=10)
|
||||||
|
if result.returncode == 0:
|
||||||
|
_hardware_cache['lspci'] = result.stdout
|
||||||
|
_hardware_cache['lspci_time'] = now
|
||||||
|
return result.stdout
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _hardware_cache['lspci'] or ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_lspci_vmm():
|
||||||
|
"""Get lspci -vmm output with 5 minute cache."""
|
||||||
|
global _hardware_cache
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
cache_key = 'lspci_vmm'
|
||||||
|
if cache_key not in _hardware_cache:
|
||||||
|
_hardware_cache[cache_key] = None
|
||||||
|
_hardware_cache[cache_key + '_time'] = 0
|
||||||
|
|
||||||
|
if _hardware_cache[cache_key] is not None and \
|
||||||
|
now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL:
|
||||||
|
return _hardware_cache[cache_key]
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10)
|
||||||
|
if result.returncode == 0:
|
||||||
|
_hardware_cache[cache_key] = result.stdout
|
||||||
|
_hardware_cache[cache_key + '_time'] = now
|
||||||
|
return result.stdout
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _hardware_cache[cache_key] or ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_lspci_k():
|
||||||
|
"""Get lspci -k output with 5 minute cache."""
|
||||||
|
global _hardware_cache
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
cache_key = 'lspci_k'
|
||||||
|
if cache_key not in _hardware_cache:
|
||||||
|
_hardware_cache[cache_key] = None
|
||||||
|
_hardware_cache[cache_key + '_time'] = 0
|
||||||
|
|
||||||
|
if _hardware_cache[cache_key] is not None and \
|
||||||
|
now - _hardware_cache[cache_key + '_time'] < _HARDWARE_CACHE_TTL:
|
||||||
|
return _hardware_cache[cache_key]
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10)
|
||||||
|
if result.returncode == 0:
|
||||||
|
_hardware_cache[cache_key] = result.stdout
|
||||||
|
_hardware_cache[cache_key + '_time'] = now
|
||||||
|
return result.stdout
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return _hardware_cache[cache_key] or ''
|
||||||
|
|
||||||
|
|
||||||
def get_proxmox_version():
|
def get_proxmox_version():
|
||||||
"""Get Proxmox version if available. Cached for 6 hours."""
|
"""Get Proxmox version if available. Cached for 6 hours."""
|
||||||
global _system_info_cache
|
global _system_info_cache
|
||||||
@@ -1237,11 +1381,8 @@ def get_vm_lxc_names():
|
|||||||
# local_node = socket.gethostname()
|
# local_node = socket.gethostname()
|
||||||
local_node = get_proxmox_node_name()
|
local_node = get_proxmox_node_name()
|
||||||
|
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
resources = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
if resources:
|
||||||
|
|
||||||
if result.returncode == 0:
|
|
||||||
resources = json.loads(result.stdout)
|
|
||||||
for resource in resources:
|
for resource in resources:
|
||||||
node = resource.get('node', '')
|
node = resource.get('node', '')
|
||||||
if node != local_node:
|
if node != local_node:
|
||||||
@@ -3247,18 +3388,15 @@ def get_proxmox_vms():
|
|||||||
# local_node = socket.gethostname()
|
# local_node = socket.gethostname()
|
||||||
local_node = get_proxmox_node_name()
|
local_node = get_proxmox_node_name()
|
||||||
|
|
||||||
# print(f"[v0] Local node detected: {local_node}")
|
# print(f"[v0] Local node detected: {local_node}")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
resources = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
if resources:
|
||||||
|
for resource in resources:
|
||||||
if result.returncode == 0:
|
node = resource.get('node', '')
|
||||||
resources = json.loads(result.stdout)
|
if node != local_node:
|
||||||
for resource in resources:
|
# print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}")
|
||||||
node = resource.get('node', '')
|
|
||||||
if node != local_node:
|
|
||||||
# print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}")
|
|
||||||
pass
|
pass
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -3696,13 +3834,13 @@ def get_temperature_info():
|
|||||||
power_meter = None
|
power_meter = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
|
sensors_output = get_cached_sensors_output()
|
||||||
if result.returncode == 0:
|
if sensors_output:
|
||||||
current_adapter = None
|
current_adapter = None
|
||||||
current_chip = None
|
current_chip = None
|
||||||
current_sensor = None
|
current_sensor = None
|
||||||
|
|
||||||
for line in result.stdout.split('\n'):
|
for line in sensors_output.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
@@ -4931,9 +5069,9 @@ def get_gpu_info():
|
|||||||
gpus = []
|
gpus = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5)
|
lspci_output = get_cached_lspci()
|
||||||
if result.returncode == 0:
|
if lspci_output:
|
||||||
for line in result.stdout.split('\n'):
|
for line in lspci_output.split('\n'):
|
||||||
# Match VGA, 3D, Display controllers
|
# Match VGA, 3D, Display controllers
|
||||||
if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
|
if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
|
||||||
|
|
||||||
@@ -4984,11 +5122,11 @@ def get_gpu_info():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
|
sensors_output = get_cached_sensors_output()
|
||||||
if result.returncode == 0:
|
if sensors_output:
|
||||||
current_adapter = None
|
current_adapter = None
|
||||||
|
|
||||||
for line in result.stdout.split('\n'):
|
for line in sensors_output.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
@@ -5399,9 +5537,9 @@ def get_hardware_info():
|
|||||||
})
|
})
|
||||||
|
|
||||||
# Always check lspci for all GPUs (integrated and discrete)
|
# Always check lspci for all GPUs (integrated and discrete)
|
||||||
result = subprocess.run(['lspci'], capture_output=True, text=True, timeout=5)
|
lspci_output = get_cached_lspci()
|
||||||
if result.returncode == 0:
|
if lspci_output:
|
||||||
for line in result.stdout.split('\n'):
|
for line in lspci_output.split('\n'):
|
||||||
# Match VGA, 3D, Display controllers
|
# Match VGA, 3D, Display controllers
|
||||||
if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
|
if any(keyword in line for keyword in ['VGA compatible controller', '3D controller', 'Display controller']):
|
||||||
parts = line.split(':', 2)
|
parts = line.split(':', 2)
|
||||||
@@ -5453,10 +5591,10 @@ def get_hardware_info():
|
|||||||
# print("[v0] Getting PCI devices with driver information...")
|
# print("[v0] Getting PCI devices with driver information...")
|
||||||
pass
|
pass
|
||||||
# First get basic device info with lspci -vmm
|
# First get basic device info with lspci -vmm
|
||||||
result = subprocess.run(['lspci', '-vmm'], capture_output=True, text=True, timeout=10)
|
lspci_vmm_output = get_cached_lspci_vmm()
|
||||||
if result.returncode == 0:
|
if lspci_vmm_output:
|
||||||
current_device = {}
|
current_device = {}
|
||||||
for line in result.stdout.split('\n'):
|
for line in lspci_vmm_output.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
if not line:
|
if not line:
|
||||||
@@ -5523,13 +5661,13 @@ def get_hardware_info():
|
|||||||
current_device[key.strip()] = value.strip()
|
current_device[key.strip()] = value.strip()
|
||||||
|
|
||||||
# Now get driver information with lspci -k
|
# Now get driver information with lspci -k
|
||||||
result_k = subprocess.run(['lspci', '-k'], capture_output=True, text=True, timeout=10)
|
lspci_k_output = get_cached_lspci_k()
|
||||||
if result_k.returncode == 0:
|
if lspci_k_output:
|
||||||
current_slot = None
|
current_slot = None
|
||||||
current_driver = None
|
current_driver = None
|
||||||
current_module = None
|
current_module = None
|
||||||
|
|
||||||
for line in result_k.stdout.split('\n'):
|
for line in lspci_k_output.split('\n'):
|
||||||
# Match PCI slot line (e.g., "00:1f.2 SATA controller: ...")
|
# Match PCI slot line (e.g., "00:1f.2 SATA controller: ...")
|
||||||
if line and not line.startswith('\t'):
|
if line and not line.startswith('\t'):
|
||||||
parts = line.split(' ', 1)
|
parts = line.split(' ', 1)
|
||||||
@@ -5579,17 +5717,17 @@ def get_hardware_info():
|
|||||||
'critical': entry.critical if entry.critical else 0
|
'critical': entry.critical if entry.critical else 0
|
||||||
})
|
})
|
||||||
|
|
||||||
# print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found")
|
# print(f"[v0] Temperature sensors: {len(hardware_data['sensors']['temperatures'])} found")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=5)
|
sensors_output = get_cached_sensors_output()
|
||||||
if result.returncode == 0:
|
if sensors_output:
|
||||||
current_adapter = None
|
current_adapter = None
|
||||||
current_chip = None # Add chip name tracking
|
current_chip = None # Add chip name tracking
|
||||||
fans = []
|
fans = []
|
||||||
|
|
||||||
for line in result.stdout.split('\n'):
|
for line in sensors_output.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
@@ -6634,10 +6772,8 @@ def api_create_backup(vmid):
|
|||||||
|
|
||||||
# Try to find VM in cluster resources
|
# Try to find VM in cluster resources
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
vms = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
if vms:
|
||||||
if result.returncode == 0:
|
|
||||||
vms = json.loads(result.stdout)
|
|
||||||
for vm in vms:
|
for vm in vms:
|
||||||
if vm.get('vmid') == vmid:
|
if vm.get('vmid') == vmid:
|
||||||
node = vm.get('node')
|
node = vm.get('node')
|
||||||
@@ -7484,11 +7620,9 @@ def api_vm_logs(vmid):
|
|||||||
"""Download real logs for a specific VM/LXC (not task history)"""
|
"""Download real logs for a specific VM/LXC (not task history)"""
|
||||||
try:
|
try:
|
||||||
# Get VM type and node
|
# Get VM type and node
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
resources = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
|
||||||
|
|
||||||
if result.returncode == 0:
|
if resources:
|
||||||
resources = json.loads(result.stdout)
|
|
||||||
vm_info = None
|
vm_info = None
|
||||||
for resource in resources:
|
for resource in resources:
|
||||||
if resource.get('vmid') == vmid:
|
if resource.get('vmid') == vmid:
|
||||||
@@ -7536,17 +7670,15 @@ def api_vm_control(vmid):
|
|||||||
data = request.get_json()
|
data = request.get_json()
|
||||||
action = data.get('action') # start, stop, shutdown, reboot
|
action = data.get('action') # start, stop, shutdown, reboot
|
||||||
|
|
||||||
if action not in ['start', 'stop', 'shutdown', 'reboot']:
|
if action not in ['start', 'stop', 'shutdown', 'reboot']:
|
||||||
return jsonify({'error': 'Invalid action'}), 400
|
return jsonify({'error': 'Invalid action'}), 400
|
||||||
|
|
||||||
# Get VM type and node
|
# Get VM type and node
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
resources = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
|
||||||
|
if resources:
|
||||||
if result.returncode == 0:
|
vm_info = None
|
||||||
resources = json.loads(result.stdout)
|
for resource in resources:
|
||||||
vm_info = None
|
|
||||||
for resource in resources:
|
|
||||||
if resource.get('vmid') == vmid:
|
if resource.get('vmid') == vmid:
|
||||||
vm_info = resource
|
vm_info = resource
|
||||||
break
|
break
|
||||||
@@ -7590,11 +7722,9 @@ def api_vm_config_update(vmid):
|
|||||||
description = data.get('description', '')
|
description = data.get('description', '')
|
||||||
|
|
||||||
# Get VM type and node
|
# Get VM type and node
|
||||||
result = subprocess.run(['pvesh', 'get', '/cluster/resources', '--type', 'vm', '--output-format', 'json'],
|
resources = get_cached_pvesh_cluster_resources_vm()
|
||||||
capture_output=True, text=True, timeout=10)
|
|
||||||
|
|
||||||
if result.returncode == 0:
|
if resources:
|
||||||
resources = json.loads(result.stdout)
|
|
||||||
vm_info = None
|
vm_info = None
|
||||||
for resource in resources:
|
for resource in resources:
|
||||||
if resource.get('vmid') == vmid:
|
if resource.get('vmid') == vmid:
|
||||||
|
|||||||
@@ -215,6 +215,14 @@ class HealthMonitor:
|
|||||||
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
|
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
|
||||||
self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls
|
self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls
|
||||||
|
|
||||||
|
# SMART check cache - reduces disk queries from every 5 min to every 30 min
|
||||||
|
self._smart_cache = {} # {disk_name: {'result': 'PASSED', 'time': timestamp}}
|
||||||
|
self._SMART_CACHE_TTL = 1800 # 30 minutes - disk health changes slowly
|
||||||
|
|
||||||
|
# Journalctl 24h cache - reduces full log reads from every 5 min to every 1 hour
|
||||||
|
self._journalctl_24h_cache = {'count': 0, 'time': 0}
|
||||||
|
self._JOURNALCTL_24H_CACHE_TTL = 3600 # 1 hour - login attempts aggregate slowly
|
||||||
|
|
||||||
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
||||||
# SMART detection still uses filesystem check on init (lightweight)
|
# SMART detection still uses filesystem check on init (lightweight)
|
||||||
has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl')
|
has_smart = os.path.exists('/usr/sbin/smartctl') or os.path.exists('/usr/bin/smartctl')
|
||||||
@@ -1758,9 +1766,20 @@ class HealthMonitor:
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
def _quick_smart_health(self, disk_name: str) -> str:
|
def _quick_smart_health(self, disk_name: str) -> str:
|
||||||
"""Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'."""
|
"""Quick SMART health check for a single disk. Returns 'PASSED', 'FAILED', or 'UNKNOWN'.
|
||||||
|
|
||||||
|
Results are cached for 30 minutes to reduce disk queries - SMART status rarely changes.
|
||||||
|
"""
|
||||||
if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
|
if not disk_name or disk_name.startswith('ata') or disk_name.startswith('zram'):
|
||||||
return 'UNKNOWN'
|
return 'UNKNOWN'
|
||||||
|
|
||||||
|
# Check cache first
|
||||||
|
current_time = time.time()
|
||||||
|
cache_key = disk_name
|
||||||
|
cached = self._smart_cache.get(cache_key)
|
||||||
|
if cached and current_time - cached['time'] < self._SMART_CACHE_TTL:
|
||||||
|
return cached['result']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
|
dev_path = f'/dev/{disk_name}' if not disk_name.startswith('/') else disk_name
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
@@ -1771,10 +1790,15 @@ class HealthMonitor:
|
|||||||
data = _json.loads(result.stdout)
|
data = _json.loads(result.stdout)
|
||||||
passed = data.get('smart_status', {}).get('passed', None)
|
passed = data.get('smart_status', {}).get('passed', None)
|
||||||
if passed is True:
|
if passed is True:
|
||||||
return 'PASSED'
|
smart_result = 'PASSED'
|
||||||
elif passed is False:
|
elif passed is False:
|
||||||
return 'FAILED'
|
smart_result = 'FAILED'
|
||||||
return 'UNKNOWN'
|
else:
|
||||||
|
smart_result = 'UNKNOWN'
|
||||||
|
|
||||||
|
# Cache the result
|
||||||
|
self._smart_cache[cache_key] = {'result': smart_result, 'time': current_time}
|
||||||
|
return smart_result
|
||||||
except Exception:
|
except Exception:
|
||||||
return 'UNKNOWN'
|
return 'UNKNOWN'
|
||||||
|
|
||||||
@@ -3960,24 +3984,36 @@ class HealthMonitor:
|
|||||||
issues.append(cert_reason or 'Certificate issue')
|
issues.append(cert_reason or 'Certificate issue')
|
||||||
|
|
||||||
# Sub-check 3: Failed login attempts (brute force detection)
|
# Sub-check 3: Failed login attempts (brute force detection)
|
||||||
|
# Cached for 1 hour to avoid reading 24h of logs every 5 minutes
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
current_time = time.time()
|
||||||
['journalctl', '--since', '24 hours ago', '--no-pager',
|
|
||||||
'-g', 'authentication failure|failed password|invalid user',
|
|
||||||
'--output=cat', '-n', '5000'],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=20
|
|
||||||
)
|
|
||||||
|
|
||||||
failed_logins = 0
|
# Check if we have a valid cached result
|
||||||
if result.returncode == 0:
|
if self._journalctl_24h_cache['time'] > 0 and \
|
||||||
for line in result.stdout.split('\n'):
|
current_time - self._journalctl_24h_cache['time'] < self._JOURNALCTL_24H_CACHE_TTL:
|
||||||
line_lower = line.lower()
|
failed_logins = self._journalctl_24h_cache['count']
|
||||||
if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower:
|
else:
|
||||||
failed_logins += 1
|
# Cache expired or first run - read full 24h logs
|
||||||
|
result = subprocess.run(
|
||||||
|
['journalctl', '--since', '24 hours ago', '--no-pager',
|
||||||
|
'-g', 'authentication failure|failed password|invalid user',
|
||||||
|
'--output=cat', '-n', '5000'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=20
|
||||||
|
)
|
||||||
|
|
||||||
if failed_logins > 50:
|
failed_logins = 0
|
||||||
|
if result.returncode == 0:
|
||||||
|
for line in result.stdout.split('\n'):
|
||||||
|
line_lower = line.lower()
|
||||||
|
if 'authentication failure' in line_lower or 'failed password' in line_lower or 'invalid user' in line_lower:
|
||||||
|
failed_logins += 1
|
||||||
|
|
||||||
|
# Cache the result
|
||||||
|
self._journalctl_24h_cache = {'count': failed_logins, 'time': current_time}
|
||||||
|
|
||||||
|
if failed_logins > 50:
|
||||||
msg = f'{failed_logins} failed login attempts in 24h'
|
msg = f'{failed_logins} failed login attempts in 24h'
|
||||||
issues.append(msg)
|
issues.append(msg)
|
||||||
checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
|
checks['login_attempts'] = {'status': 'WARNING', 'detail': msg, 'count': failed_logins, 'dismissable': True}
|
||||||
|
|||||||
@@ -1396,10 +1396,10 @@ class TaskWatcher:
|
|||||||
# 2. Check active file for newly started tasks (backup start)
|
# 2. Check active file for newly started tasks (backup start)
|
||||||
self._check_active_tasks()
|
self._check_active_tasks()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[TaskWatcher] Error reading task log: {e}")
|
print(f"[TaskWatcher] Error reading task log: {e}")
|
||||||
|
|
||||||
time.sleep(2) # Check every 2 seconds
|
time.sleep(5) # Check every 5 seconds (reduced from 2s for efficiency)
|
||||||
|
|
||||||
def _check_active_tasks(self):
|
def _check_active_tasks(self):
|
||||||
"""Scan /var/log/pve/tasks/active to track vzdump for VM suppression.
|
"""Scan /var/log/pve/tasks/active to track vzdump for VM suppression.
|
||||||
|
|||||||
Reference in New Issue
Block a user