From d2ef8f0899fdb07df2f9d966c23011cf58534aaa Mon Sep 17 00:00:00 2001 From: MacRimi Date: Thu, 28 May 2026 20:49:59 +0200 Subject: [PATCH] update add_gpu_vm.sh --- scripts/global/pci_passthrough_helpers.sh | 198 ++++++++++++++++++++++ scripts/gpu_tpu/add_gpu_vm.sh | 136 ++++++++++----- scripts/gpu_tpu/switch_gpu_mode.sh | 7 + scripts/gpu_tpu/switch_gpu_mode_direct.sh | 6 + 4 files changed, 307 insertions(+), 40 deletions(-) diff --git a/scripts/global/pci_passthrough_helpers.sh b/scripts/global/pci_passthrough_helpers.sh index 1317ff32..e11daaf7 100644 --- a/scripts/global/pci_passthrough_helpers.sh +++ b/scripts/global/pci_passthrough_helpers.sh @@ -355,3 +355,201 @@ function _pci_sriov_role() { fi echo "none" } + + +# ────────────────────────────────────────────────────────────────────── +# Per-BDF VFIO binding via udev rules (multi-GPU safe, battle-tested) +# ────────────────────────────────────────────────────────────────────── +# Writes one udev rule per BDF setting `ATTR{driver_override}="vfio-pci"`. +# udev applies this rule at the PCI ADD event BEFORE any driver (nvidia, +# amdgpu, i915) gets a chance to bind — when the kernel then tries to +# attach a driver, it sees driver_override and routes the device to +# vfio-pci instead. The native module (e.g. nvidia.ko) stays loaded for +# OTHER GPUs of the same vendor, so multi-GPU NVIDIA scenarios work. +# +# State file: /etc/proxmenux/vfio-bind.bdfs (one BDF per line, source of truth) +# Udev rules: /etc/udev/rules.d/10-proxmenux-vfio-bind.rules (regenerated +# from the state file every time it changes) +# +# Why udev and not the initramfs hook (init-top) that we tried first: +# init-top runs before sysfs is fully populated with PCI devices, and the +# driver_override write loses the race against the native driver claiming +# the device. Udev rules with ATTR{driver_override}= are processed at the +# PCI subsystem ADD event, which is exactly when we need them. 
# ──────────────────────────────────────────────────────────────────────

PROXMENUX_VFIO_BIND_STATE="/etc/proxmenux/vfio-bind.bdfs"
PROXMENUX_VFIO_BIND_UDEV_RULE="/etc/udev/rules.d/10-proxmenux-vfio-bind.rules"
# Legacy artifact paths from a previous attempt — kept here so we can
# remove them when migrating a host that ran the older init-top hook.
PROXMENUX_VFIO_BIND_LEGACY_HOOK="/etc/initramfs-tools/scripts/init-top/proxmenux-vfio-bind"

# Regenerate the udev rule file from the BDF state file.
#   Globals:   PROXMENUX_VFIO_BIND_STATE (read),
#              PROXMENUX_VFIO_BIND_UDEV_RULE (written or removed)
#   Arguments: none
#   Effects:   - state file missing or empty -> the rule file is removed
#              - otherwise one rule per non-blank, non-comment state line,
#                then udev is asked to reload its rules
#   Returns:   0 (a failing `udevadm control --reload-rules` is ignored)
_proxmenux_vfio_bind_write_udev_rule() {
    # Always nuke the obsolete init-top hook from earlier attempts (if it
    # still exists) so it is not picked up again alongside the udev rule.
    # NOTE(review): an initramfs image that was already built keeps its
    # copy until it is regenerated — confirm the callers rebuild it.
    _proxmenux_vfio_bind_cleanup_legacy

    # Nothing tracked: make sure no stale rule file is left behind.
    if [[ ! -s "$PROXMENUX_VFIO_BIND_STATE" ]]; then
        rm -f "$PROXMENUX_VFIO_BIND_UDEV_RULE"
        return 0
    fi

    mkdir -p "$(dirname "$PROXMENUX_VFIO_BIND_UDEV_RULE")"

    # Keep the loop variables local so a caller's own `bdf` is not clobbered.
    local bdf full
    {
        echo "# ProxMenux: per-BDF VFIO driver override"
        echo "# Auto-generated from $PROXMENUX_VFIO_BIND_STATE"
        echo "# DO NOT EDIT MANUALLY — regenerated by add_gpu_vm.sh / switch_gpu_mode*.sh"
        # `|| [[ -n "$bdf" ]]` keeps a last line that has no trailing newline.
        while IFS= read -r bdf || [[ -n "$bdf" ]]; do
            [[ -z "$bdf" ]] && continue
            [[ "$bdf" == \#* ]] && continue
            # KERNEL match expects the full "DDDD:BB:DD.F" form. A value that
            # already has two ':' separators carries its own PCI domain and
            # is used as-is; only the short "BB:DD.F" form gets domain 0000.
            if [[ "$bdf" == *:*:* ]]; then
                full="$bdf"
            else
                full="0000:${bdf}"
            fi
            echo "SUBSYSTEM==\"pci\", KERNEL==\"${full}\", ATTR{driver_override}=\"vfio-pci\""
        done < "$PROXMENUX_VFIO_BIND_STATE"
    } > "$PROXMENUX_VFIO_BIND_UDEV_RULE"

    # Best effort: udevadm may be unavailable (e.g. inside a chroot).
    udevadm control --reload-rules >/dev/null 2>&1 || true
}

# Cleanup helper: remove the obsolete init-top hook from a prior model.
# Called transparently by _add/_remove so any host that ran the older
# version of this helper self-heals.
_proxmenux_vfio_bind_cleanup_legacy() {
    # Deletes $PROXMENUX_VFIO_BIND_LEGACY_HOOK when it exists and, if the
    # caller has declared HOST_CONFIG_CHANGED, flags it as true.
    # Returns: 0 always.
    if [[ -f "$PROXMENUX_VFIO_BIND_LEGACY_HOOK" ]]; then
        rm -f "$PROXMENUX_VFIO_BIND_LEGACY_HOOK"
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
    fi
    return 0
}

# Private helper: print $1 in full "DDDD:BB:DD.F" form.
# A value with two ':' separators already carries a PCI domain and is
# printed untouched; the short "BB:DD.F" form gets domain 0000 prepended.
_proxmenux_vfio_bind_normalize_bdf() {
    if [[ "$1" == *:*:* ]]; then
        printf '%s\n' "$1"
    else
        printf '0000:%s\n' "$1"
    fi
}

_proxmenux_vfio_bind_add_bdfs() {
    # Args: any number of BDFs ("01:00.0" or "0000:01:00.0")
    # Appends every BDF that is not yet tracked to the state file (in full
    # form) and regenerates the udev rule when something was added.
    # Returns: 0, or 1 when the state file cannot be created.
    mkdir -p "$(dirname "$PROXMENUX_VFIO_BIND_STATE")" || return 1
    touch "$PROXMENUX_VFIO_BIND_STATE" || return 1
    _proxmenux_vfio_bind_cleanup_legacy

    # A hand-edited file may lack the final newline; terminate it first so
    # an appended BDF does not get glued onto the previous entry.
    if [[ -s "$PROXMENUX_VFIO_BIND_STATE" && -n "$(tail -c 1 "$PROXMENUX_VFIO_BIND_STATE")" ]]; then
        echo >> "$PROXMENUX_VFIO_BIND_STATE"
    fi

    local changed=false bdf normalized
    for bdf in "$@"; do
        [[ -z "$bdf" ]] && continue
        normalized=$(_proxmenux_vfio_bind_normalize_bdf "$bdf")
        if ! grep -qxF -- "$normalized" "$PROXMENUX_VFIO_BIND_STATE" 2>/dev/null; then
            printf '%s\n' "$normalized" >> "$PROXMENUX_VFIO_BIND_STATE"
            changed=true
        fi
    done

    if $changed; then
        _proxmenux_vfio_bind_write_udev_rule
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
    fi
    return 0
}

_proxmenux_vfio_bind_remove_bdfs() {
    # Args: any number of BDFs to remove from the binder list
    # Entries are compared in normalized form as literal strings, so a
    # state line stored as "01:00.0" is removed by "0000:01:00.0" and the
    # '.' in a BDF never acts as a regex wildcard.
    # Returns: 0 (also when nothing matched), 1 if the rewrite failed.
    [[ -f "$PROXMENUX_VFIO_BIND_STATE" ]] || return 0
    _proxmenux_vfio_bind_cleanup_legacy

    local bdf key
    local -A drop=()
    for bdf in "$@"; do
        [[ -z "$bdf" ]] && continue
        key=$(_proxmenux_vfio_bind_normalize_bdf "$bdf")
        drop["$key"]=1
    done
    (( ${#drop[@]} > 0 )) || return 0

    # Keep comments and blank lines; drop only the requested entries.
    local line removed=false
    local -a kept=()
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ -n "$line" && "$line" != \#* ]]; then
            key=$(_proxmenux_vfio_bind_normalize_bdf "$line")
            if [[ -n "${drop[$key]+x}" ]]; then
                removed=true
                continue
            fi
        fi
        kept+=("$line")
    done < "$PROXMENUX_VFIO_BIND_STATE"
    $removed || return 0

    # The temp file lives next to the state file so `mv` is a rename on
    # the same filesystem; mktemp creates it 0600, so copy the mode back.
    local tmp
    tmp=$(mktemp "${PROXMENUX_VFIO_BIND_STATE}.XXXXXX") || return 1
    if (( ${#kept[@]} > 0 )); then
        printf '%s\n' "${kept[@]}" > "$tmp"
    fi
    chmod --reference="$PROXMENUX_VFIO_BIND_STATE" "$tmp" 2>/dev/null || true
    if ! mv -f "$tmp" "$PROXMENUX_VFIO_BIND_STATE"; then
        rm -f "$tmp"
        return 1
    fi

    _proxmenux_vfio_bind_write_udev_rule
    if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
        HOST_CONFIG_CHANGED=true
    fi
    # If empty, remove state file too (keeps host clean)
    [[ -s "$PROXMENUX_VFIO_BIND_STATE" ]] || rm -f "$PROXMENUX_VFIO_BIND_STATE"
    return 0
}

_proxmenux_vfio_bind_purge_vendor() {
    # Removes every BDF from the binder state whose PCI vendor matches $1
    # (hex, e.g. "10de" for NVIDIA, "1002" for AMD, "8086" for Intel; an
    # optional "0x" prefix and upper case are accepted).
    # Used by switch_gpu_mode to drop all NVIDIA bindings when reverting
    # NVIDIA passthrough — the nvidia module reclaims the GPUs after the
    # next reboot.
    # The vendor is read from <sysfs>/<BDF>/vendor; a tracked BDF with no
    # readable vendor file never matches and therefore stays in the state.
    # PROXMENUX_VFIO_BIND_SYSFS overrides the sysfs directory (default
    # /sys/bus/pci/devices).
    # Returns: 0 only when at least one BDF was removed, non-zero
    # otherwise, so `purge ... && changed=true` in callers is accurate.
    local target_vendor="${1,,}"
    target_vendor="${target_vendor#0x}"
    [[ -n "$target_vendor" && -f "$PROXMENUX_VFIO_BIND_STATE" ]] || return 1

    local sysfs_root="${PROXMENUX_VFIO_BIND_SYSFS:-/sys/bus/pci/devices}"
    local -a to_remove=()
    local bdf full vendor_hex
    while IFS= read -r bdf || [[ -n "$bdf" ]]; do
        [[ -z "$bdf" || "$bdf" == \#* ]] && continue
        full=$(_proxmenux_vfio_bind_normalize_bdf "$bdf")
        vendor_hex=""
        if [[ -r "${sysfs_root}/${full}/vendor" ]]; then
            IFS= read -r vendor_hex < "${sysfs_root}/${full}/vendor"
        fi
        # The vendor file holds e.g. "0x10de"; compare lower-case, no prefix.
        vendor_hex="${vendor_hex,,}"
        vendor_hex="${vendor_hex#0x}"
        [[ "$vendor_hex" == "$target_vendor" ]] && to_remove+=("$full")
    done < "$PROXMENUX_VFIO_BIND_STATE"

    (( ${#to_remove[@]} > 0 )) || return 1
    _proxmenux_vfio_bind_remove_bdfs "${to_remove[@]}"
}

# ──────────────────────────────────────────────────────────────────────
# Auto-migrate hosts that ran the previous (broken) global-blacklist
# model. Idempotent, safe if nothing matches. Removes the global kill-
# switches so the nvidia module can load again for the GPU(s) NOT being
# passed through.
# ──────────────────────────────────────────────────────────────────────
#   Globals:   HOST_CONFIG_CHANGED (set to true only if the caller has
#              declared it and something was migrated)
#   Arguments: none
#   Outputs:   one confirmation line when something was migrated, via
#              msg_ok/translate when those functions are defined, else echo
#   Returns:   0 always
_proxmenux_nvidia_migrate_legacy_blacklist() {
    local changed=false
    local blacklist_file="/etc/modprobe.d/blacklist.conf"
    local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
    local udev_disabled="/etc/udev/rules.d/70-nvidia.rules.proxmenux-disabled"
    local udev_rules="/etc/udev/rules.d/70-nvidia.rules"
    local modules_load_disabled="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
    local modules_load_active="/etc/modules-load.d/nvidia-vfio.conf"
    # One pattern for both the check and the delete; trailing blanks are
    # tolerated so a hand-touched line is migrated as well.
    local nvidia_line_re='^blacklist (nvidia|nvidia_drm|nvidia_modeset|nvidia_uvm|nvidiafb)[[:space:]]*$'

    # 1) Drop the global `blacklist nvidia*` lines.
    if [[ -f "$blacklist_file" ]] && grep -qE "$nvidia_line_re" "$blacklist_file"; then
        sed -i -E "/${nvidia_line_re}/d" "$blacklist_file"
        changed=true
    fi

    # 2) Remove the dedicated nvidia blacklist file.
    if [[ -f "$nvidia_blacklist" ]]; then
        rm -f "$nvidia_blacklist"
        changed=true
    fi

    # 3) Put the renamed NVIDIA udev rules back; only report a change when
    #    the rename actually happened.
    if [[ -f "$udev_disabled" ]]; then
        if mv "$udev_disabled" "$udev_rules" >/dev/null 2>&1; then
            udevadm control --reload-rules >/dev/null 2>&1 || true
            changed=true
        fi
    fi

    # 4) Put the renamed modules-load.d file back.
    if [[ -f "$modules_load_disabled" ]]; then
        if mv "$modules_load_disabled" "$modules_load_active" >/dev/null 2>&1; then
            changed=true
        fi
    fi

    if $changed; then
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
        local text='Migrated legacy ProxMenux NVIDIA blacklist state — module will reload after reboot'
        if declare -F msg_ok >/dev/null 2>&1; then
            local shown="$text"
            if declare -F translate >/dev/null 2>&1; then
                # Fall back to the untranslated text if translate fails or
                # prints nothing, instead of concatenating both outputs.
                shown=$(translate "$text") || shown="$text"
                [[ -n "$shown" ]] || shown="$text"
            fi
            msg_ok "$shown"
        else
            echo "[OK] $text"
        fi
    fi
    return 0
}
diff --git a/scripts/gpu_tpu/add_gpu_vm.sh b/scripts/gpu_tpu/add_gpu_vm.sh index 82f22285..952f63f3 100644 --- a/scripts/gpu_tpu/add_gpu_vm.sh +++ 
b/scripts/gpu_tpu/add_gpu_vm.sh @@ -1603,10 +1603,72 @@ add_vfio_modules() { # ── vfio-pci IDs — merge with existing ones ───────────── configure_vfio_pci_ids() { - msg_info "$(translate 'Configuring vfio-pci device IDs...')" + msg_info "$(translate 'Configuring vfio-pci binding...')" local vfio_conf="/etc/modprobe.d/vfio.conf" touch "$vfio_conf" + # ──────────────────────────────────────────────────────────────── + # NVIDIA: per-BDF binding (multi-GPU safe). The `options vfio-pci + # ids=VENDOR:DEVICE` approach captures EVERY GPU with the same + # vendor:device ID — fatal when two NVIDIA GPUs share a model. + # Instead, we list the exact BDF(s) of the target GPU in a + # per-BDF udev rule, and add `softdep nvidia pre: vfio-pci` so vfio + # has a chance to claim the BDF before nvidia loads. + # ──────────────────────────────────────────────────────────────── + if [[ "$SELECTED_GPU" == "nvidia" ]]; then + # Clean up any previous ids= line that captured this NVIDIA + # (older versions of this script wrote it; remove to avoid + # collateral grabs on sibling GPUs of the same model). + if grep -qE '^options vfio-pci ids=' "$vfio_conf" 2>/dev/null; then + local existing_line ids_part + existing_line=$(grep '^options vfio-pci ids=' "$vfio_conf" | head -1) + ids_part=$(echo "$existing_line" | grep -oE 'ids=[^[:space:]]+' | sed 's/ids=//') + + local kept=() + IFS=',' read -ra existing_ids <<< "$ids_part" + for eid in "${existing_ids[@]}"; do + local drop=false + for nvid in "${IOMMU_VFIO_IDS[@]}"; do + [[ "$eid" == "$nvid" ]] && drop=true && break + done + $drop || kept+=("$eid") + done + + sed -i '/^options vfio-pci ids=/d' "$vfio_conf" + if [[ ${#kept[@]} -gt 0 ]]; then + local kept_str + kept_str=$(IFS=','; echo "${kept[*]}") + echo "options vfio-pci ids=${kept_str} disable_vga=1" >> "$vfio_conf" + fi + HOST_CONFIG_CHANGED=true + fi + + # Ensure vfio loads before nvidia so the per-BDF override wins. 
+ _add_line_if_missing "softdep nvidia pre: vfio-pci" "$vfio_conf" + _add_line_if_missing "softdep nvidia_drm pre: vfio-pci" "$vfio_conf" + _add_line_if_missing "softdep nvidia_modeset pre: vfio-pci" "$vfio_conf" + _add_line_if_missing "softdep nvidia_uvm pre: vfio-pci" "$vfio_conf" + + # Per-BDF binder (udev rule). IOMMU_DEVICES has the BDFs for the GPU + # we're passing (and any same-group functions like the audio + # function). Add all of them so the whole IOMMU group goes to + # vfio-pci as Proxmox expects. + local -a bdfs_to_bind=() + for bdf in "${IOMMU_DEVICES[@]}"; do + bdfs_to_bind+=("$bdf") + done + _proxmenux_vfio_bind_add_bdfs "${bdfs_to_bind[@]}" + + msg_ok "$(translate 'NVIDIA per-BDF VFIO binding configured') (${bdfs_to_bind[*]})" | tee -a "$screen_capture" + return 0 + fi + + # ──────────────────────────────────────────────────────────────── + # AMD / Intel: keep the legacy options vfio-pci ids= approach. + # These vendors rarely run multi-GPU same-model on the same host, + # and their drivers don't have the kill-switch problem nvidia has. + # ──────────────────────────────────────────────────────────────── + # Collect existing IDs (if any) local existing_ids=() local existing_line @@ -1671,12 +1733,13 @@ blacklist_gpu_drivers() { case "$SELECTED_GPU" in nvidia) + # Only blacklist the open-source `nouveau` driver — never the + # proprietary `nvidia` module. Blacklisting nvidia globally + # would kill any OTHER NVIDIA GPU that should stay on the host + # (multi-GPU NVIDIA scenarios). The VFIO binding for the GPUs + # passed through is handled by the ProxMenux udev rule via per-BDF + # driver_override + softdep nvidia pre: vfio-pci. 
_add_line_if_missing "blacklist nouveau" "$blacklist_file" - _add_line_if_missing "blacklist nvidia" "$blacklist_file" - _add_line_if_missing "blacklist nvidia_drm" "$blacklist_file" - _add_line_if_missing "blacklist nvidia_modeset" "$blacklist_file" - _add_line_if_missing "blacklist nvidia_uvm" "$blacklist_file" - _add_line_if_missing "blacklist nvidiafb" "$blacklist_file" _add_line_if_missing "blacklist lbm-nouveau" "$blacklist_file" _add_line_if_missing "options nouveau modeset=0" "$blacklist_file" ;; @@ -1692,6 +1755,18 @@ blacklist_gpu_drivers() { } sanitize_nvidia_host_stack_for_vfio() { + # In the new per-BDF model we only stop systemd services that could + # actively probe / lock GPUs at boot (persistenced) — but we DO NOT: + # - blacklist the nvidia kernel module + # - remove nvidia entries from /etc/modules + # - rename /etc/modules-load.d/nvidia-vfio.conf + # - rename /etc/udev/rules.d/70-nvidia.rules + # - create /etc/modprobe.d/nvidia-blacklist.conf with install /bin/false + # All of those were global and broke multi-GPU NVIDIA scenarios where + # one GPU goes to a VM (vfio-pci) and another stays on the host + # (nvidia driver). VFIO binding is now per-BDF via driver_override in + # a udev rule — the nvidia module stays usable for any GPU not + # explicitly targeted. 
msg_info "$(translate 'Sanitizing NVIDIA host services for VFIO mode...')" local changed=false local state_dir="/var/lib/proxmenux" @@ -1730,46 +1805,21 @@ sanitize_nvidia_host_stack_for_vfio() { [[ -s "$state_file" ]] || rm -f "$state_file" - if [[ -f /etc/modules-load.d/nvidia-vfio.conf ]]; then - mv /etc/modules-load.d/nvidia-vfio.conf /etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio >>"$LOG_FILE" 2>&1 || true - changed=true - fi - - if grep -qE '^(nvidia|nvidia_uvm|nvidia_drm|nvidia_modeset)$' /etc/modules 2>/dev/null; then - sed -i '/^nvidia$/d;/^nvidia_uvm$/d;/^nvidia_drm$/d;/^nvidia_modeset$/d' /etc/modules - changed=true - fi - - # Disable NVIDIA udev rules that trigger nvidia-smi (causes conflict with vfio-pci) - local udev_rules="/etc/udev/rules.d/70-nvidia.rules" - if [[ -f "$udev_rules" ]]; then - mv "$udev_rules" "${udev_rules}.proxmenux-disabled" >>"$LOG_FILE" 2>&1 || true - udevadm control --reload-rules >>"$LOG_FILE" 2>&1 || true - changed=true - fi - - # Create hard blacklist to prevent ANY nvidia module loading (even via modprobe/nvidia-smi) - local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf" - if [[ ! 
-f "$nvidia_blacklist" ]]; then - cat > "$nvidia_blacklist" <<'EOF' -# ProxMenux: Hard blacklist to prevent ANY nvidia module loading in VFIO mode -# This prevents nvidia-smi and other tools from triggering module load attempts -install nvidia /bin/false -install nvidia_uvm /bin/false -install nvidia_drm /bin/false -install nvidia_modeset /bin/false -EOF - changed=true - fi - if $changed; then HOST_CONFIG_CHANGED=true - msg_ok "$(translate 'NVIDIA host services/autoload disabled for VFIO mode')" | tee -a "$screen_capture" + msg_ok "$(translate 'NVIDIA host services disabled for VFIO mode')" | tee -a "$screen_capture" else - msg_ok "$(translate 'NVIDIA host services/autoload already aligned for VFIO mode')" | tee -a "$screen_capture" + msg_ok "$(translate 'NVIDIA host services already aligned for VFIO mode')" | tee -a "$screen_capture" fi } +# Per-BDF VFIO binder + legacy NVIDIA blacklist migration are defined in +# scripts/global/pci_passthrough_helpers.sh (sourced at the top of this file). +# Functions exposed there: +# _proxmenux_vfio_bind_add_bdfs +# _proxmenux_vfio_bind_remove_bdfs +# _proxmenux_nvidia_migrate_legacy_blacklist + # ── AMD ROM dump: sysfs first, VFCT ACPI table as fallback ─────────────── _dump_rom_via_vfct() { @@ -2187,6 +2237,12 @@ main() { msg_title "${run_title}" fi + # Auto-migrate any leftover state from the previous (broken) global + # NVIDIA blacklist model BEFORE applying new config. Idempotent: no-op + # on clean hosts. Always runs in the NVIDIA flow so a host that was + # configured with an old ProxMenux release self-heals on the next run. 
+ [[ "$SELECTED_GPU" == "nvidia" ]] && _proxmenux_nvidia_migrate_legacy_blacklist + if [[ "$VM_SWITCH_ALREADY_VFIO" == "true" ]]; then msg_ok "$(translate 'Host already in VFIO mode — skipping host reconfiguration for VM reassignment')" | tee -a "$screen_capture" else diff --git a/scripts/gpu_tpu/switch_gpu_mode.sh b/scripts/gpu_tpu/switch_gpu_mode.sh index d302bf50..15b42fe1 100644 --- a/scripts/gpu_tpu/switch_gpu_mode.sh +++ b/scripts/gpu_tpu/switch_gpu_mode.sh @@ -346,6 +346,13 @@ _restore_nvidia_host_stack_for_lxc() { local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio" local active_file="/etc/modules-load.d/nvidia-vfio.conf" + # New per-BDF model: drop every NVIDIA BDF from the udev binder so + # the nvidia module reclaims the GPU after the next reboot. Idempotent: + # no-op if no NVIDIA BDFs are tracked. Vendor 10de = NVIDIA. + if declare -F _proxmenux_vfio_bind_purge_vendor >/dev/null 2>&1; then + _proxmenux_vfio_bind_purge_vendor "10de" && changed=true + fi + # Remove hard blacklist that was preventing nvidia module loading local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf" if [[ -f "$nvidia_blacklist" ]]; then diff --git a/scripts/gpu_tpu/switch_gpu_mode_direct.sh b/scripts/gpu_tpu/switch_gpu_mode_direct.sh index de32909e..b5652d07 100644 --- a/scripts/gpu_tpu/switch_gpu_mode_direct.sh +++ b/scripts/gpu_tpu/switch_gpu_mode_direct.sh @@ -324,6 +324,12 @@ _restore_nvidia_host_stack_for_lxc() { local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio" local active_file="/etc/modules-load.d/nvidia-vfio.conf" + # New per-BDF model: drop every NVIDIA BDF from the udev binder so + # the nvidia module reclaims the GPU after the next reboot. Idempotent. 
+ if declare -F _proxmenux_vfio_bind_purge_vendor >/dev/null 2>&1; then + _proxmenux_vfio_bind_purge_vendor "10de" && changed=true + fi + # Remove hard blacklist that was preventing nvidia module loading local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf" if [[ -f "$nvidia_blacklist" ]]; then