Files
ProxMenux/scripts/gpu_tpu/nvidia_update.sh
2026-04-01 23:09:51 +02:00

495 lines
16 KiB
Bash

#!/bin/bash
# ProxMenux - NVIDIA Driver Updater (Host + LXC)
# ================================================
# Author : MacRimi
# License : MIT
# Version : 1.0
# Last Updated: 01/04/2026
# ================================================
# ProxMenux installation layout on the host; the other paths hang off BASE_DIR.
BASE_DIR="/usr/local/share/proxmenux"
LOCAL_SCRIPTS="${BASE_DIR}/scripts"
UTILS_FILE="${BASE_DIR}/utils.sh"

# Log file that receives the output of the external commands run below.
LOG_FILE="/tmp/nvidia_update.log"

# Upstream release directory and the host-side directory used for installers.
NVIDIA_BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64"
NVIDIA_WORKDIR="/opt/nvidia"

# Load the shared helper library when it is present.
[[ ! -f "$UTILS_FILE" ]] || source "$UTILS_FILE"

# NOTE(review): load_language / initialize_cache are not defined in this file;
# presumably they come from utils.sh - confirm.
load_language
initialize_cache
# ============================================================
# Host NVIDIA state detection
# ============================================================
# Detect the NVIDIA driver that is currently active on the host.
# Globals written:
#   HOST_NVIDIA_VERSION - first driver version reported by nvidia-smi, or empty
#   HOST_NVIDIA_READY   - "true" when a version could be read, else "false"
# Exits the script with status 0 (after an explanatory dialog) when the nvidia
# kernel module is not loaded, nvidia-smi is missing, or no version is reported.
detect_host_nvidia() {
  HOST_NVIDIA_VERSION=""
  HOST_NVIDIA_READY=false

  if lsmod | grep -q "^nvidia " && command -v nvidia-smi >/dev/null 2>&1; then
    HOST_NVIDIA_VERSION=$(nvidia-smi --query-gpu=driver_version \
      --format=csv,noheader 2>/dev/null | head -n1 | tr -d '[:space:]')
    if [[ -n "$HOST_NVIDIA_VERSION" ]]; then
      HOST_NVIDIA_READY=true
    fi
  fi

  # Nothing more to do when a driver was found.
  if [[ "$HOST_NVIDIA_READY" == true ]]; then
    return 0
  fi

  local notice
  notice="\n$(translate 'No NVIDIA driver is currently loaded on this host.')"
  notice+="\n\n$(translate 'Please install NVIDIA drivers first using the option:')"
  notice+="\n\n $(translate 'Install NVIDIA Drivers on Host')"
  notice+="\n\n$(translate 'from this same GPU and TPU menu.')"

  dialog --backtitle "ProxMenux" \
    --title "$(translate 'NVIDIA Not Found')" \
    --msgbox "$notice" \
    13 72
  exit 0
}
# ============================================================
# LXC containers with NVIDIA passthrough
# ============================================================
# Collect the IDs of LXC containers whose config passes an NVIDIA device through.
# Arguments:
#   $1 - (optional) directory holding <ctid>.conf files; default /etc/pve/lxc
# Globals written:
#   NVIDIA_CONTAINERS - array of container IDs (config file names without .conf)
find_nvidia_containers() {
  local conf_dir="${1:-/etc/pve/lxc}"
  local conf name
  NVIDIA_CONTAINERS=()
  for conf in "$conf_dir"/*.conf; do
    # With no match the glob stays literal; skip anything that is not a file.
    [[ -f "$conf" ]] || continue
    # Anchor on the key so commented-out lines ("#dev0: /dev/nvidia0") and
    # free-text lines that merely mention a device entry are not counted.
    if grep -qiE '^[[:space:]]*dev[0-9]+:.*nvidia' "$conf"; then
      name=${conf##*/}
      NVIDIA_CONTAINERS+=("${name%.conf}")
    fi
  done
}
# Print the NVIDIA userspace driver version found in a container.
# Arguments:
#   $1 - container ID
# Outputs:
#   The version on stdout, or the translated text 'not installed'.
get_lxc_nvidia_version() {
  local ctid="$1"
  local version=""
  local version_re='^[0-9]+(\.[0-9]+)+$'

  # Prefer nvidia-smi when the container is running (works with .run-installed drivers)
  if pct status "$ctid" 2>/dev/null | grep -q "running"; then
    version=$(pct exec "$ctid" -- nvidia-smi \
      --query-gpu=driver_version --format=csv,noheader 2>/dev/null \
      | head -n1 | tr -d '[:space:]' || true)
    # Whatever is printed on stdout gets captured, including error text;
    # only keep it when it really looks like a dotted version number.
    [[ "$version" =~ $version_re ]] || version=""
  fi

  # Fallback: dpkg status for apt-installed libcuda1 (dir-type storage, no start needed)
  if [[ -z "$version" ]]; then
    local rootfs="/var/lib/lxc/${ctid}/rootfs"
    if [[ -f "${rootfs}/var/lib/dpkg/status" ]]; then
      # Walk the whole libcuda1 stanza: the Version field may sit more than a
      # few lines below the Package line, so a fixed -A window can miss it.
      version=$(awk '
        /^Package: /            { in_pkg = ($2 == "libcuda1") }
        /^$/                    { in_pkg = 0 }
        in_pkg && /^Version: /  { print $2; exit }
      ' "${rootfs}/var/lib/dpkg/status" | cut -d- -f1)
    fi
  fi

  echo "${version:-$(translate 'not installed')}"
}
# ============================================================
# Version list from NVIDIA servers
# ============================================================
# List the driver versions published in the NVIDIA download directory.
# Globals read:
#   NVIDIA_BASE_URL - URL of the directory index to scrape
# Outputs:
#   One version per line on stdout, newest first, without duplicates.
# Returns:
#   1 (after printing an empty line) when the index could not be fetched.
list_available_versions() {
  local html
  # -f: treat HTTP errors as failures so an error page is never parsed.
  html=$(curl -fsSL --connect-timeout 15 "${NVIDIA_BASE_URL}/" 2>/dev/null) || true
  if [[ -z "$html" ]]; then
    echo ""
    return 1
  fi

  # Accept href values in single quotes, double quotes or unquoted, keep the
  # ones that start with a digit, then drop the attribute prefix and the
  # trailing slash of the directory entry.
  printf '%s\n' "$html" \
    | grep -oE "href=[\"']?[0-9][^\"' >]*" \
    | sed -E "s/^href=[\"']?//; s#/\$##" \
    | sort -Vr \
    | uniq
}
# Print the version named in NVIDIA's latest.txt.
# Globals read:
#   NVIDIA_BASE_URL - base URL that hosts latest.txt
# Outputs:
#   The first whitespace-separated field of the first non-blank line, with no
#   trailing newline; nothing when the file cannot be fetched.
get_latest_version() {
  local latest_line
  latest_line=$(curl -fsSL --connect-timeout 15 "${NVIDIA_BASE_URL}/latest.txt" 2>/dev/null) || true
  # Stop after the first usable line: stripping whitespace from several
  # lines would otherwise glue their versions into one bogus string.
  awk 'NF { print $1; exit }' <<< "$latest_line" | tr -d '[:space:]'
}
# ============================================================
# Version selection menu
# ============================================================
# Ask the user which driver version to install.
# Globals read:
#   HOST_NVIDIA_VERSION - shown in the menu text as the current host version
# Globals written:
#   TARGET_VERSION - the chosen version ("latest" is resolved to a real number)
# Exits with status 1 when neither the latest version nor the version list can
# be obtained, and with status 0 when the menu is cancelled or returns nothing.
select_target_version() {
  local latest versions_list

  msg_info "$(translate 'Fetching available NVIDIA versions...')"
  latest=$(get_latest_version 2>/dev/null)
  versions_list=$(list_available_versions 2>/dev/null)
  msg_ok "$(translate 'Version list retrieved.')"

  # Both sources empty: there is nothing to offer.
  if [[ -z "${latest}${versions_list}" ]]; then
    dialog --backtitle "ProxMenux" \
      --title "$(translate 'Error')" \
      --msgbox "\n$(translate 'Could not retrieve versions from NVIDIA. Please check your internet connection.')" \
      8 72
    exit 1
  fi

  # Let each source stand in for the other when only one of them answered.
  if [[ -z "$latest" ]]; then
    latest=$(head -1 <<< "$versions_list")
  fi
  if [[ -z "$versions_list" ]]; then
    versions_list="$latest"
  fi
  latest=$(tr -d '[:space:]' <<< "$latest")

  # Menu entries are tag/label pairs: "latest", an empty spacer row, then one
  # row per version.
  local -a choices=(
    "latest" "$(translate 'Latest available') (${latest:-?})"
    "" ""
  )
  local entry
  while IFS= read -r entry; do
    entry=$(tr -d '[:space:]' <<< "$entry")
    [[ -n "$entry" ]] || continue
    choices+=("$entry" "$entry")
  done <<< "$versions_list"

  local menu_text
  menu_text="\n$(translate 'Current host version:') ${HOST_NVIDIA_VERSION}\n"
  menu_text+="$(translate 'Select the target version to install on host and all affected LXCs:')"

  # The menu is drawn on the terminal; the selected tag arrives via stderr.
  TARGET_VERSION=$(dialog --backtitle "ProxMenux" \
    --title "$(translate 'NVIDIA Driver Version')" \
    --menu "$menu_text" 26 80 16 \
    "${choices[@]}" \
    2>&1 >/dev/tty) || exit 0

  case "$TARGET_VERSION" in
    "") exit 0 ;;
    latest) TARGET_VERSION="$latest" ;;
  esac
  TARGET_VERSION=$(tr -d '[:space:]' <<< "$TARGET_VERSION")
}
# ============================================================
# Update NVIDIA userspace libs inside a single LXC
# ============================================================
# Best-effort cleanup for an aborted container update: delete the given host
# paths and stop the container again if this script had to start it.
# Arguments:
#   $1    - container ID
#   $2    - "true" when the container was already running beforehand
#   $3... - host files/directories to delete (empty values are ignored)
_update_lxc_nvidia_abort() {
  local ctid="$1" was_running="$2"
  shift 2
  local path
  for path in "$@"; do
    if [[ -n "$path" ]]; then
      rm -rf -- "$path"
    fi
  done
  if [[ "$was_running" == "false" ]]; then
    pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true
  fi
}

# Install the NVIDIA userspace libraries of a given version inside one LXC.
# The .run installer is unpacked on the host, packed into a tarball, pushed
# into the container and executed there without kernel modules.
# Globals read:
#   NVIDIA_WORKDIR - directory holding NVIDIA-Linux-x86_64-<version>.run
#   LOG_FILE       - receives the output of the external commands
# Arguments:
#   $1 - container ID
#   $2 - driver version to install
# Returns:
#   0 on success; 1 when the installer is missing, the container cannot be
#   started, or extraction / packing / push / installation fails.
update_lxc_nvidia() {
  local ctid="$1"
  local version="$2"
  local was_running=false
  local run_file="${NVIDIA_WORKDIR}/NVIDIA-Linux-x86_64-${version}.run"
  local extract_dir="${NVIDIA_WORKDIR}/extracted_${version}"
  local archive=""
  local old_version ready attempt rc

  # Check for the installer before touching the container, so a stopped
  # container is not started just to be stopped again.
  if [[ ! -f "$run_file" ]]; then
    msg_warn "$(translate 'Installer not found:') ${run_file} - $(translate 'skipping container') ${ctid}"
    return 1
  fi

  # Capture old version before update
  old_version=$(get_lxc_nvidia_version "$ctid")

  if pct status "$ctid" 2>/dev/null | grep -q "running"; then
    was_running=true
  else
    msg_info "$(translate 'Starting container') ${ctid}..."
    pct start "$ctid" >>"$LOG_FILE" 2>&1 || true
    ready=false
    # Poll for up to ~30 seconds until commands can be executed inside.
    for attempt in {1..15}; do
      sleep 2
      if pct exec "$ctid" -- true >/dev/null 2>&1; then
        ready=true
        break
      fi
    done
    if [[ "$ready" != true ]]; then
      msg_warn "$(translate 'Container') ${ctid} $(translate 'did not start. Skipping.')"
      return 1
    fi
    msg_ok "$(translate 'Container') ${ctid} $(translate 'started.')"
  fi

  msg_info "$(translate 'Updating NVIDIA libs in container') ${ctid}..."

  # Extract .run on the host to avoid decompression failures inside the container
  msg_info "$(translate 'Extracting NVIDIA installer on host...')"
  rm -rf -- "$extract_dir"
  if ! sh "$run_file" --extract-only --target "$extract_dir" >>"$LOG_FILE" 2>&1; then
    msg_warn "$(translate 'Extraction failed. Check log:') ${LOG_FILE}"
    _update_lxc_nvidia_abort "$ctid" "$was_running" "$extract_dir"
    return 1
  fi
  msg_ok "$(translate 'Extracted.')"

  msg_info "$(translate 'Packing and copying installer to container') ${ctid}..."
  # mktemp gives an unpredictable archive name instead of a fixed /tmp path.
  if ! archive=$(mktemp "${TMPDIR:-/tmp}/nvidia_lxc_${version}.XXXXXX") \
    || ! tar -czf "$archive" -C "$extract_dir" . >>"$LOG_FILE" 2>&1; then
    msg_warn "$(translate 'Packing the installer failed. Check log:') ${LOG_FILE}"
    _update_lxc_nvidia_abort "$ctid" "$was_running" "$extract_dir" "$archive"
    return 1
  fi
  if ! pct push "$ctid" "$archive" /tmp/nvidia_lxc.tar.gz >>"$LOG_FILE" 2>&1; then
    msg_warn "$(translate 'pct push failed. Check log:') ${LOG_FILE}"
    _update_lxc_nvidia_abort "$ctid" "$was_running" "$extract_dir" "$archive"
    return 1
  fi
  # The host-side copies are no longer needed once the archive is inside.
  rm -rf -- "$archive" "$extract_dir"
  msg_ok "$(translate 'Installer copied to container.')"

  msg_info2 "$(translate 'Starting NVIDIA installer in container') ${ctid}. $(translate 'This may take several minutes...')"
  echo "" >>"$LOG_FILE"

  # Script executed inside the container. It starts from a clean directory,
  # only runs the installer when unpacking worked, and always removes its
  # temporary files before reporting the installer's exit code.
  local ct_script='
rm -rf /tmp/nvidia_lxc_install
mkdir -p /tmp/nvidia_lxc_install
if tar -xzf /tmp/nvidia_lxc.tar.gz -C /tmp/nvidia_lxc_install 2>&1; then
  /tmp/nvidia_lxc_install/nvidia-installer \
    --no-kernel-modules \
    --no-questions \
    --ui=none \
    --no-nouveau-check \
    --no-dkms
  EXIT=$?
else
  EXIT=1
fi
rm -rf /tmp/nvidia_lxc_install /tmp/nvidia_lxc.tar.gz
exit $EXIT
'
  pct exec "$ctid" -- bash -c "$ct_script" 2>&1 | tee -a "$LOG_FILE"
  rc=${PIPESTATUS[0]}

  if [[ $rc -ne 0 ]]; then
    msg_warn "$(translate 'NVIDIA installer returned error') ${rc}. $(translate 'Check log:') ${LOG_FILE}"
    _update_lxc_nvidia_abort "$ctid" "$was_running"
    return 1
  fi

  msg_ok "$(translate 'Container') ${ctid}: ${old_version} -> ${version}"
  msg_info2 "$(translate 'NVIDIA driver verification in container') ${ctid}:"
  pct exec "$ctid" -- nvidia-smi 2>/dev/null || true

  # Put the container back into the state it was found in.
  if [[ "$was_running" == "false" ]]; then
    msg_info "$(translate 'Stopping container') ${ctid}..."
    pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true
    msg_ok "$(translate 'Container stopped.')"
  fi
  return 0
}
# ============================================================
# Host NVIDIA update
# ============================================================
# Stop and disable the NVIDIA helper services, ignoring any failure.
# A unit is only stopped when systemctl reports it active and only disabled
# when systemctl reports it enabled.
_stop_nvidia_services() {
  local unit
  for unit in nvidia-persistenced.service nvidia-powerd.service; do
    if systemctl is-active --quiet "$unit" 2>/dev/null; then
      systemctl stop "$unit" >/dev/null 2>&1 || true
    fi
    if systemctl is-enabled --quiet "$unit" 2>/dev/null; then
      systemctl disable "$unit" >/dev/null 2>&1 || true
    fi
  done
}
# Unload the NVIDIA kernel modules, ignoring any failure.
# The list is walked twice: a plain `modprobe -r` pass first, then a pass with
# --force for modules that are still loaded.
_unload_nvidia_modules() {
  local -a modules=(nvidia_uvm nvidia_drm nvidia_modeset nvidia)
  local module

  for module in "${modules[@]}"; do
    modprobe -r "$module" >/dev/null 2>&1 || true
  done

  # Second pass for stubborn modules
  for module in "${modules[@]}"; do
    modprobe -r --force "$module" >/dev/null 2>&1 || true
  done
}
# Remove the NVIDIA driver that is currently installed on the host.
# Steps: stop services, unload modules, run nvidia-uninstall when available,
# drop the DKMS registrations of the "nvidia" module, purge matching apt
# packages, and delete leftover udev/modprobe configuration. Every step is
# best-effort; failures are logged to LOG_FILE and otherwise ignored.
# Globals read:
#   LOG_FILE - receives the output of the uninstall/apt commands
_purge_nvidia_host() {
  msg_info "$(translate 'Uninstalling current NVIDIA driver from host...')"
  _stop_nvidia_services
  _unload_nvidia_modules

  if command -v nvidia-uninstall >/dev/null 2>&1; then
    nvidia-uninstall --silent >>"$LOG_FILE" 2>&1 || true
  fi

  # Remove DKMS entries. `dkms status` prints either
  #   nvidia, <version>, <kernel>, <arch>: installed
  # or
  #   nvidia/<version>, <kernel>, <arch>: installed
  # depending on the dkms release, so split on both "/" and "," and take the
  # field that follows the module name.
  local dkms_versions ver
  dkms_versions=$(dkms status 2>/dev/null | awk '
    $0 ~ "^nvidia[/,]" {
      entry = $0
      sub(":.*$", "", entry)
      if (split(entry, field, "[/,][ \t]*") >= 2 && field[2] != "") print field[2]
    }' | sort -u || true)
  while IFS= read -r ver; do
    [[ -z "$ver" ]] && continue
    dkms remove -m nvidia -v "$ver" --all >/dev/null 2>&1 || true
  done <<< "$dkms_versions"

  # Purge each pattern on its own: apt-get rejects the whole command when one
  # of its patterns matches no package, which would skip the other patterns.
  local pattern
  for pattern in 'nvidia-*' 'libnvidia-*' 'cuda-*'; do
    apt-get -y purge "$pattern" >>"$LOG_FILE" 2>&1 || true
  done
  apt-get -y autoremove --purge >>"$LOG_FILE" 2>&1 || true

  rm -f /etc/udev/rules.d/70-nvidia.rules
  rm -f /etc/modprobe.d/nvidia*.conf /usr/lib/modprobe.d/nvidia*.conf
  msg_ok "$(translate 'Current NVIDIA driver removed from host.')"
}
# Tell whether a file looks like a complete NVIDIA .run installer: it exists,
# is larger than 40 MB and `file` describes it as executable.
# Arguments:
#   $1 - path to check
_nvidia_installer_looks_valid() {
  local path="$1" size
  [[ -f "$path" ]] || return 1
  size=$(stat -c%s "$path" 2>/dev/null || echo "0")
  [[ "$size" -gt 40000000 ]] || return 1
  file "$path" 2>/dev/null | grep -q "executable"
}

# Make sure the .run installer for a version is present in NVIDIA_WORKDIR,
# downloading it when no valid cached copy exists.
# Globals read:
#   NVIDIA_WORKDIR  - cache directory (created when missing)
#   NVIDIA_BASE_URL - download root
#   LOG_FILE        - receives curl's output
# Arguments:
#   $1 - driver version
# Outputs:
#   stdout carries ONLY the installer path, so callers can capture it with
#   $(...); status messages are sent to stderr.
# Exits with status 1 when no download succeeds. Inside a command substitution
# that ends just the subshell, so callers must check the substitution's status.
_download_installer() {
  local version="$1"
  local run_file="${NVIDIA_WORKDIR}/NVIDIA-Linux-x86_64-${version}.run"
  mkdir -p "$NVIDIA_WORKDIR"

  # Reuse cached file if valid
  if _nvidia_installer_looks_valid "$run_file"; then
    msg_ok "$(translate 'Installer already cached.')" >&2
    echo "$run_file"
    return 0
  fi
  rm -f "$run_file"

  msg_info "$(translate 'Downloading NVIDIA driver') ${version}..." >&2
  # Try the regular installer first, then the variant without 32-bit libs.
  local urls=(
    "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}.run"
    "${NVIDIA_BASE_URL}/${version}/NVIDIA-Linux-x86_64-${version}-no-compat32.run"
  )
  local url
  local ok=false
  for url in "${urls[@]}"; do
    if curl -fL --connect-timeout 30 --max-time 600 "$url" -o "$run_file" >>"$LOG_FILE" 2>&1 \
      && _nvidia_installer_looks_valid "$run_file"; then
      ok=true
      break
    fi
    # Never leave a partial or bogus file behind for the next attempt.
    rm -f "$run_file"
  done

  if [[ "$ok" != true ]]; then
    msg_error "$(translate 'Download failed. Check /tmp/nvidia_update.log')" >&2
    exit 1
  fi

  chmod +x "$run_file"
  msg_ok "$(translate 'Download complete.')" >&2
  echo "$run_file"
}
# Run an NVIDIA .run installer on the host, unattended and with DKMS enabled.
# Globals read:
#   NVIDIA_WORKDIR - parent of the scratch directory handed to --tmpdir
#   LOG_FILE       - receives the installer output
# Arguments:
#   $1 - path to the .run installer
# Exits the script with status 1 when the installer reports a failure.
_run_installer() {
  local installer="$1"
  local tmp_dir="${NVIDIA_WORKDIR}/tmp_extract"
  local -a installer_opts=(
    --tmpdir="$tmp_dir"
    --no-questions
    --ui=none
    --disable-nouveau
    --no-nouveau-check
    --dkms
  )
  local rc=0

  mkdir -p "$tmp_dir"
  msg_info "$(translate 'Installing NVIDIA driver on host. This may take several minutes...')"

  sh "$installer" "${installer_opts[@]}" >>"$LOG_FILE" 2>&1 || rc=$?

  # The scratch directory is removed whether or not the installer succeeded.
  rm -rf "$tmp_dir"

  if (( rc != 0 )); then
    msg_error "$(translate 'NVIDIA installer failed. Check /tmp/nvidia_update.log')"
    exit 1
  fi
  msg_ok "$(translate 'NVIDIA driver installed on host.')"
}
# Replace the host's NVIDIA driver with the given version.
# Order matters: the installer is obtained and verified BEFORE the current
# driver is purged, so a failed download never leaves the host without one.
# Globals read:
#   LOG_FILE - receives the update-initramfs output
# Arguments:
#   $1 - driver version to install
# Exits the script with status 1 when the installer cannot be obtained.
update_host_nvidia() {
  local version="$1"
  local download_out installer

  # _download_installer signals failure by exiting; inside $(...) that only
  # ends the subshell, so the status has to be checked here.
  if ! download_out=$(_download_installer "$version"); then
    msg_error "$(translate 'NVIDIA installer is not available. The current host driver was left untouched.')"
    exit 1
  fi
  # The path is the last line of the output; ignore anything printed before it.
  installer=${download_out##*$'\n'}
  if [[ ! -f "$installer" ]]; then
    msg_error "$(translate 'NVIDIA installer is not available. The current host driver was left untouched.')"
    exit 1
  fi

  _purge_nvidia_host
  _run_installer "$installer"

  msg_info "$(translate 'Updating initramfs...')"
  update-initramfs -u -k all >>"$LOG_FILE" 2>&1 || true
  msg_ok "$(translate 'initramfs updated.')"
}
# ============================================================
# Overview dialog (current state)
# ============================================================
# Show a yes/no overview of the host driver and of every LXC container with
# NVIDIA passthrough before anything is changed.
# Globals read:
#   HOST_NVIDIA_VERSION - version displayed for the host
# Globals written:
#   NVIDIA_CONTAINERS   - refreshed through find_nvidia_containers
# Exits the script with status 0 when the dialog does not return success.
show_current_state_dialog() {
  find_nvidia_containers

  local info
  info="\n$(translate 'Host NVIDIA driver:') ${HOST_NVIDIA_VERSION}\n\n"

  if (( ${#NVIDIA_CONTAINERS[@]} == 0 )); then
    info+="$(translate 'No LXC containers with NVIDIA passthrough found.')\n"
  else
    info+="$(translate 'LXC containers with NVIDIA passthrough:')\n\n"
    local ctid lxc_ver ct_name
    for ctid in "${NVIDIA_CONTAINERS[@]}"; do
      lxc_ver=$(get_lxc_nvidia_version "$ctid")
      # Second field of the "hostname:" line of the container config.
      ct_name=$(pct config "$ctid" 2>/dev/null | awk '/^hostname:/ {print $2}')
      info+=" CT ${ctid} ${ct_name:+(${ct_name})} — libcuda1: ${lxc_ver}\n"
    done
  fi

  info+="\n$(translate 'After selecting a version, LXC containers will be updated first, then the host.')"
  info+="\n$(translate 'A reboot is required after the host update.')"

  if ! dialog --backtitle "ProxMenux" \
    --title "$(translate 'NVIDIA Update — Current State')" \
    --yesno "$info" 20 80 \
    >/dev/tty 2>&1; then
    exit 0
  fi
}
# ============================================================
# Restart prompt
# ============================================================
# Offer to reboot the host once the update has finished.
# On "yes" the server is rebooted; otherwise a reminder is printed and the
# function waits for Enter before returning.
restart_prompt() {
  if ! whiptail --title "$(translate 'NVIDIA Update')" --yesno \
    "$(translate 'The host driver update requires a reboot to take effect. Reboot now?')" 10 70; then
    msg_success "$(translate 'Update complete. Please reboot the server manually.')"
    msg_success "$(translate 'Completed. Press Enter to return to menu...')"
    read -r
    return
  fi

  msg_warn "$(translate 'Restarting the server...')"
  reboot
}
# ============================================================
# Main
# ============================================================
# Interactive update workflow: gather state and the target version through
# dialogs, download the installer once, update every affected LXC container,
# then update the host and offer a reboot.
# Globals read/written: LOG_FILE (truncated), HOST_NVIDIA_VERSION,
#   NVIDIA_CONTAINERS and TARGET_VERSION (set by the dialog helpers).
# Exits with status 1 when the installer cannot be downloaded; nothing on the
# host or in the containers has been modified at that point.
main() {
  : >"$LOG_FILE"

  # ---- Phase 1: dialogs ----
  detect_host_nvidia
  show_current_state_dialog
  select_target_version

  # Same version confirmation
  if [[ "$TARGET_VERSION" == "$HOST_NVIDIA_VERSION" ]]; then
    if ! dialog --backtitle "ProxMenux" \
      --title "$(translate 'Same Version')" \
      --yesno "\n$(translate 'Version') ${TARGET_VERSION} $(translate 'is already installed on the host.')\n\n$(translate 'Reinstall and force-update all LXC containers anyway?')" \
      10 70 >/dev/tty 2>&1; then
      exit 0
    fi
  fi

  # ---- Phase 2: processing ----
  show_proxmenux_logo
  msg_title "$(translate 'NVIDIA Driver Update')"

  # Download installer once — reused by both LXC containers and host.
  # The downloader reports failure by exiting, which inside $(...) only ends
  # the subshell; check the status here so a failed download stops the run
  # before any container or the host driver is touched.
  local run_file
  if ! run_file=$(_download_installer "$TARGET_VERSION"); then
    msg_error "$(translate 'Download failed. Check /tmp/nvidia_update.log')"
    exit 1
  fi
  # The path is the last line of the output; ignore anything printed before it.
  run_file=${run_file##*$'\n'}
  if [[ ! -f "$run_file" ]]; then
    msg_error "$(translate 'Download failed. Check /tmp/nvidia_update.log')"
    exit 1
  fi

  # Update LXC containers first (no reboot needed for userspace libs)
  local ctid
  local -a failed_cts=()
  if [[ ${#NVIDIA_CONTAINERS[@]} -gt 0 ]]; then
    msg_info2 "$(translate 'Updating LXC containers...')"
    for ctid in "${NVIDIA_CONTAINERS[@]}"; do
      # A failing container must not block the others or the host update.
      update_lxc_nvidia "$ctid" "$TARGET_VERSION" || failed_cts+=("$ctid")
    done
    if [[ ${#failed_cts[@]} -gt 0 ]]; then
      msg_warn "$(translate 'Containers that could not be updated:') ${failed_cts[*]}"
    fi
  fi

  # Update host kernel module + drivers (reuses the already-downloaded installer)
  update_host_nvidia "$TARGET_VERSION"
  restart_prompt
}
# Script entry point: run the interactive update workflow defined in main().
main