mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-31 04:24:44 +00:00
update add_gpu_vm.sh
This commit is contained in:
@@ -355,3 +355,201 @@ function _pci_sriov_role() {
|
|||||||
fi
|
fi
|
||||||
echo "none"
|
echo "none"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Per-BDF VFIO binding via udev rules (multi-GPU safe, battle-tested)
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Writes one udev rule per BDF setting `ATTR{driver_override}="vfio-pci"`.
|
||||||
|
# udev applies this rule at the PCI ADD event BEFORE any driver (nvidia,
|
||||||
|
# amdgpu, i915) gets a chance to bind — when the kernel then tries to
|
||||||
|
# attach a driver, it sees driver_override and routes the device to
|
||||||
|
# vfio-pci instead. The native module (e.g. nvidia.ko) stays loaded for
|
||||||
|
# OTHER GPUs of the same vendor, so multi-GPU NVIDIA scenarios work.
|
||||||
|
#
|
||||||
|
# State file: /etc/proxmenux/vfio-bind.bdfs (one BDF per line, source of truth)
|
||||||
|
# Udev rules: /etc/udev/rules.d/10-proxmenux-vfio-bind.rules (regenerated
|
||||||
|
# from the state file every time it changes)
|
||||||
|
#
|
||||||
|
# Why udev and not the initramfs hook (init-top) that we tried first:
|
||||||
|
# init-top runs before sysfs is fully populated with PCI devices, and the
|
||||||
|
# driver_override write loses the race against the native driver claiming
|
||||||
|
# the device. Udev rules with ATTR{driver_override}= are processed at the
|
||||||
|
# PCI subsystem ADD event, which is exactly when we need them.
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# State file: one PCI address (BDF) per line; source of truth for the binder.
PROXMENUX_VFIO_BIND_STATE="/etc/proxmenux/vfio-bind.bdfs"
# udev rule file regenerated from the state file every time it changes.
PROXMENUX_VFIO_BIND_UDEV_RULE="/etc/udev/rules.d/10-proxmenux-vfio-bind.rules"
# Legacy artifact paths from a previous attempt — kept here so we can
# remove them when migrating a host that ran the older init-top hook.
PROXMENUX_VFIO_BIND_LEGACY_HOOK="/etc/initramfs-tools/scripts/init-top/proxmenux-vfio-bind"
|
||||||
|
|
||||||
|
_proxmenux_vfio_bind_write_udev_rule() {
    # Regenerates the udev rule file from the current state file.
    # Globals: PROXMENUX_VFIO_BIND_STATE (read),
    #          PROXMENUX_VFIO_BIND_UDEV_RULE (written / removed)
    # Args:    none
    # Returns: 0 on success (including "state empty, rule removed"),
    #          1 if the rule file could not be written.

    # Always nuke the obsolete init-top hook from earlier attempts (if it
    # still exists) so a stale copy in initramfs can't run alongside the
    # udev rule. Best-effort: its status must not abort the regeneration.
    _proxmenux_vfio_bind_cleanup_legacy || true

    # Empty or missing state file: no device is bound any more, so drop the
    # rule file and make udev forget the old rules.
    if [[ ! -s "$PROXMENUX_VFIO_BIND_STATE" ]]; then
        if [[ -e "$PROXMENUX_VFIO_BIND_UDEV_RULE" ]]; then
            rm -f -- "$PROXMENUX_VFIO_BIND_UDEV_RULE"
            udevadm control --reload-rules >/dev/null 2>&1 || true
        fi
        return 0
    fi

    local rule_dir tmp_rule bdf full
    # Kernel PCI device names: "DDDD:BB:DD.F", lowercase hex, function 0-7.
    local bdf_re='^[0-9a-f]{4,}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$'

    rule_dir=$(dirname -- "$PROXMENUX_VFIO_BIND_UDEV_RULE")
    mkdir -p -- "$rule_dir" || return 1

    # Build the new content next to the target and rename it into place so a
    # half-written rule file is never visible. The temp name does not end in
    # ".rules", so udev does not load it as a rule file.
    tmp_rule=$(mktemp "${PROXMENUX_VFIO_BIND_UDEV_RULE}.XXXXXX") || return 1

    if ! {
        echo "# ProxMenux: per-BDF VFIO driver override"
        echo "# Auto-generated from $PROXMENUX_VFIO_BIND_STATE"
        echo "# DO NOT EDIT MANUALLY — regenerated by add_gpu_vm.sh / switch_gpu_mode*.sh"
        # "|| [[ -n ... ]]" keeps a last line that has no trailing newline.
        while IFS= read -r bdf || [[ -n "$bdf" ]]; do
            [[ -z "$bdf" || "$bdf" == \#* ]] && continue
            # KERNEL match expects the full "DDDD:BB:DD.F" form. Only add the
            # default domain when none is present (two colons = has domain),
            # so non-zero domains such as "0001:02:00.1" stay intact.
            full="$bdf"
            [[ "$full" == *:*:* ]] || full="0000:${full}"
            # KERNEL== is compared against the lowercase kernel device name.
            full="${full,,}"
            if [[ ! "$full" =~ $bdf_re ]]; then
                # Never interpolate a malformed line into a udev rule.
                printf 'proxmenux: ignoring malformed PCI address in %s: %s\n' \
                    "$PROXMENUX_VFIO_BIND_STATE" "$bdf" >&2
                continue
            fi
            printf 'SUBSYSTEM=="pci", KERNEL=="%s", ATTR{driver_override}="vfio-pci"\n' "$full"
        done < "$PROXMENUX_VFIO_BIND_STATE"
    } > "$tmp_rule"; then
        rm -f -- "$tmp_rule"
        return 1
    fi

    # mktemp creates the file 0600; use the usual mode for rule files.
    chmod 0644 "$tmp_rule" 2>/dev/null || true
    if ! mv -f -- "$tmp_rule" "$PROXMENUX_VFIO_BIND_UDEV_RULE"; then
        rm -f -- "$tmp_rule"
        return 1
    fi

    udevadm control --reload-rules >/dev/null 2>&1 || true
    return 0
}
|
||||||
|
|
||||||
|
# Cleanup helper: remove the obsolete init-top hook from a prior model.
|
||||||
|
# Called transparently by _add/_remove so any host that ran the older
|
||||||
|
# version of this helper self-heals.
|
||||||
|
_proxmenux_vfio_bind_cleanup_legacy() {
    # Removes the obsolete init-top hook file if it is still present.
    # Globals: PROXMENUX_VFIO_BIND_LEGACY_HOOK (read),
    #          HOST_CONFIG_CHANGED (set to "true" only if the caller already
    #          declared it and a hook file was actually removed)
    # Args:    none
    # Returns: always 0. The previous form ended in `[[ ... ]] && var=true`,
    #          which leaked status 1 whenever HOST_CONFIG_CHANGED was unset.
    [[ -f "$PROXMENUX_VFIO_BIND_LEGACY_HOOK" ]] || return 0

    if rm -f -- "$PROXMENUX_VFIO_BIND_LEGACY_HOOK"; then
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
    fi
    return 0
}
|
||||||
|
|
||||||
|
_proxmenux_vfio_bind_add_bdfs() {
    # Adds PCI addresses to the binder state file and regenerates the udev
    # rule when the list actually changed.
    # Args:    any number of BDFs ("01:00.0", "0000:01:00.0", or a full
    #          address with a non-zero domain such as "0001:01:00.0")
    # Globals: PROXMENUX_VFIO_BIND_STATE (read / appended),
    #          HOST_CONFIG_CHANGED (set to "true" if already declared and
    #          at least one address was added)
    # Returns: 0 on success (also when nothing had to be added),
    #          1 if the state file could not be created or written.
    local state_dir
    state_dir=$(dirname -- "$PROXMENUX_VFIO_BIND_STATE")
    mkdir -p -- "$state_dir" || return 1
    touch -- "$PROXMENUX_VFIO_BIND_STATE" || return 1
    _proxmenux_vfio_bind_cleanup_legacy || true

    local changed=false bdf normalized
    local bdf_re='^[0-9a-fA-F]{4,}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-7]$'
    for bdf in "$@"; do
        [[ -z "$bdf" ]] && continue
        # Normalize to "DDDD:BB:DD.F". Only prepend the default domain when
        # the address has none (a full address contains two colons); testing
        # for a literal "0000:" prefix would mangle non-zero domains.
        if [[ "$bdf" == *:*:* ]]; then
            normalized="$bdf"
        else
            normalized="0000:${bdf}"
        fi
        # Each stored line ends up verbatim in a udev rule, so refuse
        # anything that is not a well-formed PCI address.
        if [[ ! "$normalized" =~ $bdf_re ]]; then
            printf 'proxmenux: skipping invalid PCI address: %s\n' "$bdf" >&2
            continue
        fi
        if ! grep -qxF -- "$normalized" "$PROXMENUX_VFIO_BIND_STATE" 2>/dev/null; then
            printf '%s\n' "$normalized" >> "$PROXMENUX_VFIO_BIND_STATE" || return 1
            changed=true
        fi
    done

    if $changed; then
        _proxmenux_vfio_bind_write_udev_rule
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
    fi
    # Explicit status: a trailing `[[ ... ]] && ...` would otherwise leak 1.
    return 0
}
|
||||||
|
|
||||||
|
_proxmenux_vfio_bind_remove_bdfs() {
    # Removes PCI addresses from the binder state file and regenerates the
    # udev rule when the list actually changed.
    # Args:    any number of BDFs to remove from the binder list
    #          ("01:00.0" or a full "DDDD:BB:DD.F" address)
    # Globals: PROXMENUX_VFIO_BIND_STATE (read / rewritten / removed),
    #          HOST_CONFIG_CHANGED (set to "true" if already declared and
    #          at least one line was removed)
    # Returns: 0 on success (also when nothing matched or no state file
    #          exists), 1 if the state file could not be rewritten.
    [[ -f "$PROXMENUX_VFIO_BIND_STATE" ]] || return 0
    _proxmenux_vfio_bind_cleanup_legacy || true

    local bdf normalized line target tmp drop removed=false
    local -a targets=()
    for bdf in "$@"; do
        [[ -z "$bdf" ]] && continue
        # Only prepend the default domain when the address has none, so a
        # non-zero domain such as "0001:02:00.0" is matched as given.
        if [[ "$bdf" == *:*:* ]]; then
            normalized="$bdf"
        else
            normalized="0000:${bdf}"
        fi
        targets+=("$normalized")
    done
    (( ${#targets[@]} > 0 )) || return 0

    # Temp file lives next to the state file so the final rename stays on
    # one filesystem; bail out instead of continuing with an empty path.
    tmp=$(mktemp "${PROXMENUX_VFIO_BIND_STATE}.XXXXXX") || return 1

    # Filter by exact string comparison. The address is never used as a
    # regex, so "." and other metacharacters cannot match unrelated lines.
    while IFS= read -r line || [[ -n "$line" ]]; do
        drop=false
        for target in "${targets[@]}"; do
            if [[ "$line" == "$target" ]]; then
                drop=true
                break
            fi
        done
        if $drop; then
            removed=true
        else
            printf '%s\n' "$line"
        fi
    done < "$PROXMENUX_VFIO_BIND_STATE" > "$tmp"

    if ! $removed; then
        rm -f -- "$tmp"
        return 0
    fi

    # mktemp creates 0600; keep the mode the state file already had.
    chmod --reference="$PROXMENUX_VFIO_BIND_STATE" "$tmp" 2>/dev/null || true
    if ! mv -f -- "$tmp" "$PROXMENUX_VFIO_BIND_STATE"; then
        rm -f -- "$tmp"
        return 1
    fi

    _proxmenux_vfio_bind_write_udev_rule
    if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
        HOST_CONFIG_CHANGED=true
    fi
    # If empty, remove state file too (keeps host clean)
    if [[ ! -s "$PROXMENUX_VFIO_BIND_STATE" ]]; then
        rm -f -- "$PROXMENUX_VFIO_BIND_STATE"
    fi
    return 0
}
|
||||||
|
|
||||||
|
_proxmenux_vfio_bind_purge_vendor() {
    # Removes every BDF from the binder state whose PCI vendor matches $1.
    # Used by switch_gpu_mode to drop all NVIDIA bindings when reverting
    # NVIDIA passthrough — the nvidia module reclaims the GPUs after the
    # next reboot.
    # Args:    $1 - vendor ID in hex, case-insensitive, optional "0x" prefix
    #               (e.g. "10de" for NVIDIA, "1002" for AMD, "8086" for Intel)
    #          $2 - optional directory holding one sub-directory per PCI
    #               device with a "vendor" file (default /sys/bus/pci/devices)
    # Globals: PROXMENUX_VFIO_BIND_STATE (read)
    # Returns: 0 if at least one tracked BDF matched and was handed to
    #          _proxmenux_vfio_bind_remove_bdfs, 1 if there was nothing to
    #          purge — so `purge_vendor X && changed=true` reports a change
    #          only when one happened.
    local target_vendor="${1:-}"
    local sysfs_root="${2:-/sys/bus/pci/devices}"
    target_vendor="${target_vendor,,}"
    target_vendor="${target_vendor#0x}"
    [[ -n "$target_vendor" && -f "$PROXMENUX_VFIO_BIND_STATE" ]] || return 1

    local -a to_remove=()
    local bdf full vendor_file vendor_hex
    while IFS= read -r bdf || [[ -n "$bdf" ]]; do
        [[ -z "$bdf" || "$bdf" == \#* ]] && continue
        # Only prepend the default domain when the address has none.
        full="$bdf"
        [[ "$full" == *:*:* ]] || full="0000:${full}"
        # Device directories are looked up by their lowercase address.
        vendor_file="${sysfs_root}/${full,,}/vendor"
        # Device not present (any more): vendor unknown, leave it tracked.
        [[ -r "$vendor_file" ]] || continue
        vendor_hex=""
        IFS= read -r vendor_hex < "$vendor_file" || true
        # The vendor file holds e.g. "0x10de"; compare bare lowercase hex.
        vendor_hex="${vendor_hex,,}"
        vendor_hex="${vendor_hex#0x}"
        if [[ "$vendor_hex" == "$target_vendor" ]]; then
            to_remove+=("$full")
        fi
    done < "$PROXMENUX_VFIO_BIND_STATE"

    (( ${#to_remove[@]} > 0 )) || return 1
    _proxmenux_vfio_bind_remove_bdfs "${to_remove[@]}"
    return 0
}
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Auto-migrate hosts that ran the previous (broken) global-blacklist
|
||||||
|
# model. Idempotent, safe if nothing matches. Removes the global kill-
|
||||||
|
# switches so the nvidia module can load again for the GPU(s) NOT being
|
||||||
|
# passed through.
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
_proxmenux_nvidia_migrate_legacy_blacklist() {
    # Undoes the artifacts of the previous global NVIDIA blacklist model:
    #   - "blacklist nvidia*" lines in /etc/modprobe.d/blacklist.conf
    #   - /etc/modprobe.d/nvidia-blacklist.conf
    #   - 70-nvidia.rules renamed to *.proxmenux-disabled
    #   - modules-load.d/nvidia-vfio.conf renamed to *.proxmenux-disabled-vfio
    # Args:    $1 - optional filesystem root prefix (default: empty, i.e.
    #               the live host). With a prefix, udev is not reloaded.
    # Globals: HOST_CONFIG_CHANGED (set to "true" if already declared and
    #          something was migrated)
    # Outputs: one confirmation line via msg_ok (if defined) or echo, only
    #          when something was migrated.
    # Returns: always 0.
    local root="${1:-}"
    root="${root%/}"
    local changed=false
    local blacklist_file="${root}/etc/modprobe.d/blacklist.conf"
    local nvidia_blacklist="${root}/etc/modprobe.d/nvidia-blacklist.conf"
    local udev_disabled="${root}/etc/udev/rules.d/70-nvidia.rules.proxmenux-disabled"
    local udev_rules="${root}/etc/udev/rules.d/70-nvidia.rules"
    local modules_load_disabled="${root}/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
    local modules_load_active="${root}/etc/modules-load.d/nvidia-vfio.conf"
    # One pattern shared by the check and the deletion so they cannot drift.
    local legacy_re='^blacklist (nvidia|nvidia_drm|nvidia_modeset|nvidia_uvm|nvidiafb)$'
    local done_msg='Migrated legacy ProxMenux NVIDIA blacklist state — module will reload after reboot'
    local shown_msg

    if [[ -f "$blacklist_file" ]] && grep -qE -- "$legacy_re" "$blacklist_file"; then
        if sed -i -E "/${legacy_re}/d" "$blacklist_file"; then
            changed=true
        fi
    fi

    if [[ -f "$nvidia_blacklist" ]]; then
        if rm -f -- "$nvidia_blacklist"; then
            changed=true
        fi
    fi

    if [[ -f "$udev_disabled" ]]; then
        # Report a change only if the rename really happened.
        if mv -- "$udev_disabled" "$udev_rules" >/dev/null 2>&1; then
            changed=true
        fi
        if [[ -z "$root" ]]; then
            udevadm control --reload-rules >/dev/null 2>&1 || true
        fi
    fi

    if [[ -f "$modules_load_disabled" ]]; then
        if mv -- "$modules_load_disabled" "$modules_load_active" >/dev/null 2>&1; then
            changed=true
        fi
    fi

    if $changed; then
        if [[ -n "${HOST_CONFIG_CHANGED+x}" ]]; then
            HOST_CONFIG_CHANGED=true
        fi
        if declare -F msg_ok >/dev/null 2>&1; then
            # Fall back to the untranslated text if translate is missing,
            # fails, or returns nothing; never print both.
            shown_msg="$done_msg"
            if declare -F translate >/dev/null 2>&1; then
                shown_msg=$(translate "$done_msg") || shown_msg="$done_msg"
                [[ -n "$shown_msg" ]] || shown_msg="$done_msg"
            fi
            msg_ok "$shown_msg"
        else
            echo "[OK] $done_msg"
        fi
    fi
    return 0
}
|
||||||
|
|||||||
@@ -1603,10 +1603,72 @@ add_vfio_modules() {
|
|||||||
|
|
||||||
# ── vfio-pci IDs — merge with existing ones ─────────────
|
# ── vfio-pci IDs — merge with existing ones ─────────────
|
||||||
configure_vfio_pci_ids() {
|
configure_vfio_pci_ids() {
|
||||||
msg_info "$(translate 'Configuring vfio-pci device IDs...')"
|
msg_info "$(translate 'Configuring vfio-pci binding...')"
|
||||||
local vfio_conf="/etc/modprobe.d/vfio.conf"
|
local vfio_conf="/etc/modprobe.d/vfio.conf"
|
||||||
touch "$vfio_conf"
|
touch "$vfio_conf"
|
||||||
|
|
||||||
|
# ────────────────────────────────────────────────────────────────
|
||||||
|
# NVIDIA: per-BDF binding (multi-GPU safe). The `options vfio-pci
|
||||||
|
# ids=VENDOR:DEVICE` approach captures EVERY GPU with the same
|
||||||
|
# vendor:device ID — fatal when two NVIDIA GPUs share a model.
|
||||||
|
# Instead, we list the exact BDF(s) of the target GPU in the
|
||||||
|
# per-BDF udev rule file, and add `softdep nvidia pre: vfio-pci` so vfio
|
||||||
|
# has a chance to claim the BDF before nvidia loads.
|
||||||
|
# ────────────────────────────────────────────────────────────────
|
||||||
|
if [[ "$SELECTED_GPU" == "nvidia" ]]; then
|
||||||
|
# Clean up any previous ids= line that captured this NVIDIA
|
||||||
|
# (older versions of this script wrote it; remove to avoid
|
||||||
|
# collateral grabs on sibling GPUs of the same model).
|
||||||
|
if grep -qE '^options vfio-pci ids=' "$vfio_conf" 2>/dev/null; then
|
||||||
|
local existing_line ids_part
|
||||||
|
existing_line=$(grep '^options vfio-pci ids=' "$vfio_conf" | head -1)
|
||||||
|
ids_part=$(echo "$existing_line" | grep -oE 'ids=[^[:space:]]+' | sed 's/ids=//')
|
||||||
|
|
||||||
|
local kept=()
|
||||||
|
IFS=',' read -ra existing_ids <<< "$ids_part"
|
||||||
|
for eid in "${existing_ids[@]}"; do
|
||||||
|
local drop=false
|
||||||
|
for nvid in "${IOMMU_VFIO_IDS[@]}"; do
|
||||||
|
[[ "$eid" == "$nvid" ]] && drop=true && break
|
||||||
|
done
|
||||||
|
$drop || kept+=("$eid")
|
||||||
|
done
|
||||||
|
|
||||||
|
sed -i '/^options vfio-pci ids=/d' "$vfio_conf"
|
||||||
|
if [[ ${#kept[@]} -gt 0 ]]; then
|
||||||
|
local kept_str
|
||||||
|
kept_str=$(IFS=','; echo "${kept[*]}")
|
||||||
|
echo "options vfio-pci ids=${kept_str} disable_vga=1" >> "$vfio_conf"
|
||||||
|
fi
|
||||||
|
HOST_CONFIG_CHANGED=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ensure vfio loads before nvidia so the per-BDF override wins.
|
||||||
|
_add_line_if_missing "softdep nvidia pre: vfio-pci" "$vfio_conf"
|
||||||
|
_add_line_if_missing "softdep nvidia_drm pre: vfio-pci" "$vfio_conf"
|
||||||
|
_add_line_if_missing "softdep nvidia_modeset pre: vfio-pci" "$vfio_conf"
|
||||||
|
_add_line_if_missing "softdep nvidia_uvm pre: vfio-pci" "$vfio_conf"
|
||||||
|
|
||||||
|
# Per-BDF binder hook. IOMMU_DEVICES has the BDFs for the GPU
|
||||||
|
# we're passing (and any same-group functions like the audio
|
||||||
|
# function). Add all of them so the whole IOMMU group goes to
|
||||||
|
# vfio-pci as Proxmox expects.
|
||||||
|
local -a bdfs_to_bind=()
|
||||||
|
for bdf in "${IOMMU_DEVICES[@]}"; do
|
||||||
|
bdfs_to_bind+=("$bdf")
|
||||||
|
done
|
||||||
|
_proxmenux_vfio_bind_add_bdfs "${bdfs_to_bind[@]}"
|
||||||
|
|
||||||
|
msg_ok "$(translate 'NVIDIA per-BDF VFIO binding configured') (${bdfs_to_bind[*]})" | tee -a "$screen_capture"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ────────────────────────────────────────────────────────────────
|
||||||
|
# AMD / Intel: keep the legacy options vfio-pci ids= approach.
|
||||||
|
# These vendors rarely run multi-GPU same-model on the same host,
|
||||||
|
# and their drivers don't have the kill-switch problem nvidia has.
|
||||||
|
# ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
# Collect existing IDs (if any)
|
# Collect existing IDs (if any)
|
||||||
local existing_ids=()
|
local existing_ids=()
|
||||||
local existing_line
|
local existing_line
|
||||||
@@ -1671,12 +1733,13 @@ blacklist_gpu_drivers() {
|
|||||||
|
|
||||||
case "$SELECTED_GPU" in
|
case "$SELECTED_GPU" in
|
||||||
nvidia)
|
nvidia)
|
||||||
|
# Only blacklist the open-source `nouveau` driver — never the
|
||||||
|
# proprietary `nvidia` module. Blacklisting nvidia globally
|
||||||
|
# would kill any OTHER NVIDIA GPU that should stay on the host
|
||||||
|
# (multi-GPU NVIDIA scenarios). The VFIO binding for the GPUs
|
||||||
|
# passed through is handled by `proxmenux-vfio-bind` via per-BDF
|
||||||
|
# driver_override + softdep nvidia pre: vfio-pci.
|
||||||
_add_line_if_missing "blacklist nouveau" "$blacklist_file"
|
_add_line_if_missing "blacklist nouveau" "$blacklist_file"
|
||||||
_add_line_if_missing "blacklist nvidia" "$blacklist_file"
|
|
||||||
_add_line_if_missing "blacklist nvidia_drm" "$blacklist_file"
|
|
||||||
_add_line_if_missing "blacklist nvidia_modeset" "$blacklist_file"
|
|
||||||
_add_line_if_missing "blacklist nvidia_uvm" "$blacklist_file"
|
|
||||||
_add_line_if_missing "blacklist nvidiafb" "$blacklist_file"
|
|
||||||
_add_line_if_missing "blacklist lbm-nouveau" "$blacklist_file"
|
_add_line_if_missing "blacklist lbm-nouveau" "$blacklist_file"
|
||||||
_add_line_if_missing "options nouveau modeset=0" "$blacklist_file"
|
_add_line_if_missing "options nouveau modeset=0" "$blacklist_file"
|
||||||
;;
|
;;
|
||||||
@@ -1692,6 +1755,18 @@ blacklist_gpu_drivers() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
sanitize_nvidia_host_stack_for_vfio() {
|
sanitize_nvidia_host_stack_for_vfio() {
|
||||||
|
# In the new per-BDF model we only stop systemd services that could
|
||||||
|
# actively probe / lock GPUs at boot (persistenced) — but we DO NOT:
|
||||||
|
# - blacklist the nvidia kernel module
|
||||||
|
# - remove nvidia entries from /etc/modules
|
||||||
|
# - rename /etc/modules-load.d/nvidia-vfio.conf
|
||||||
|
# - rename /etc/udev/rules.d/70-nvidia.rules
|
||||||
|
# - create /etc/modprobe.d/nvidia-blacklist.conf with install /bin/false
|
||||||
|
# All of those were global and broke multi-GPU NVIDIA scenarios where
|
||||||
|
# one GPU goes to a VM (vfio-pci) and another stays on the host
|
||||||
|
# (nvidia driver). VFIO binding is now per-BDF via driver_override in
|
||||||
|
# a udev rule — the nvidia module stays usable for any GPU not
|
||||||
|
# explicitly targeted.
|
||||||
msg_info "$(translate 'Sanitizing NVIDIA host services for VFIO mode...')"
|
msg_info "$(translate 'Sanitizing NVIDIA host services for VFIO mode...')"
|
||||||
local changed=false
|
local changed=false
|
||||||
local state_dir="/var/lib/proxmenux"
|
local state_dir="/var/lib/proxmenux"
|
||||||
@@ -1730,46 +1805,21 @@ sanitize_nvidia_host_stack_for_vfio() {
|
|||||||
|
|
||||||
[[ -s "$state_file" ]] || rm -f "$state_file"
|
[[ -s "$state_file" ]] || rm -f "$state_file"
|
||||||
|
|
||||||
if [[ -f /etc/modules-load.d/nvidia-vfio.conf ]]; then
|
|
||||||
mv /etc/modules-load.d/nvidia-vfio.conf /etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio >>"$LOG_FILE" 2>&1 || true
|
|
||||||
changed=true
|
|
||||||
fi
|
|
||||||
|
|
||||||
if grep -qE '^(nvidia|nvidia_uvm|nvidia_drm|nvidia_modeset)$' /etc/modules 2>/dev/null; then
|
|
||||||
sed -i '/^nvidia$/d;/^nvidia_uvm$/d;/^nvidia_drm$/d;/^nvidia_modeset$/d' /etc/modules
|
|
||||||
changed=true
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Disable NVIDIA udev rules that trigger nvidia-smi (causes conflict with vfio-pci)
|
|
||||||
local udev_rules="/etc/udev/rules.d/70-nvidia.rules"
|
|
||||||
if [[ -f "$udev_rules" ]]; then
|
|
||||||
mv "$udev_rules" "${udev_rules}.proxmenux-disabled" >>"$LOG_FILE" 2>&1 || true
|
|
||||||
udevadm control --reload-rules >>"$LOG_FILE" 2>&1 || true
|
|
||||||
changed=true
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create hard blacklist to prevent ANY nvidia module loading (even via modprobe/nvidia-smi)
|
|
||||||
local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
|
|
||||||
if [[ ! -f "$nvidia_blacklist" ]]; then
|
|
||||||
cat > "$nvidia_blacklist" <<'EOF'
|
|
||||||
# ProxMenux: Hard blacklist to prevent ANY nvidia module loading in VFIO mode
|
|
||||||
# This prevents nvidia-smi and other tools from triggering module load attempts
|
|
||||||
install nvidia /bin/false
|
|
||||||
install nvidia_uvm /bin/false
|
|
||||||
install nvidia_drm /bin/false
|
|
||||||
install nvidia_modeset /bin/false
|
|
||||||
EOF
|
|
||||||
changed=true
|
|
||||||
fi
|
|
||||||
|
|
||||||
if $changed; then
|
if $changed; then
|
||||||
HOST_CONFIG_CHANGED=true
|
HOST_CONFIG_CHANGED=true
|
||||||
msg_ok "$(translate 'NVIDIA host services/autoload disabled for VFIO mode')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'NVIDIA host services disabled for VFIO mode')" | tee -a "$screen_capture"
|
||||||
else
|
else
|
||||||
msg_ok "$(translate 'NVIDIA host services/autoload already aligned for VFIO mode')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'NVIDIA host services already aligned for VFIO mode')" | tee -a "$screen_capture"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Per-BDF VFIO binder + legacy NVIDIA blacklist migration are defined in
|
||||||
|
# scripts/global/pci_passthrough_helpers.sh (sourced at the top of this file).
|
||||||
|
# Functions exposed there:
|
||||||
|
# _proxmenux_vfio_bind_add_bdfs <bdf...>
|
||||||
|
# _proxmenux_vfio_bind_remove_bdfs <bdf...>
|
||||||
|
# _proxmenux_nvidia_migrate_legacy_blacklist
|
||||||
|
|
||||||
|
|
||||||
# ── AMD ROM dump: sysfs first, VFCT ACPI table as fallback ───────────────
|
# ── AMD ROM dump: sysfs first, VFCT ACPI table as fallback ───────────────
|
||||||
_dump_rom_via_vfct() {
|
_dump_rom_via_vfct() {
|
||||||
@@ -2187,6 +2237,12 @@ main() {
|
|||||||
msg_title "${run_title}"
|
msg_title "${run_title}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Auto-migrate any leftover state from the previous (broken) global
|
||||||
|
# NVIDIA blacklist model BEFORE applying new config. Idempotent: no-op
|
||||||
|
# on clean hosts. Always runs in the NVIDIA flow so a host that was
|
||||||
|
# configured with an old ProxMenux release self-heals on the next run.
|
||||||
|
[[ "$SELECTED_GPU" == "nvidia" ]] && _proxmenux_nvidia_migrate_legacy_blacklist
|
||||||
|
|
||||||
if [[ "$VM_SWITCH_ALREADY_VFIO" == "true" ]]; then
|
if [[ "$VM_SWITCH_ALREADY_VFIO" == "true" ]]; then
|
||||||
msg_ok "$(translate 'Host already in VFIO mode — skipping host reconfiguration for VM reassignment')" | tee -a "$screen_capture"
|
msg_ok "$(translate 'Host already in VFIO mode — skipping host reconfiguration for VM reassignment')" | tee -a "$screen_capture"
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -346,6 +346,13 @@ _restore_nvidia_host_stack_for_lxc() {
|
|||||||
local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
|
local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
|
||||||
local active_file="/etc/modules-load.d/nvidia-vfio.conf"
|
local active_file="/etc/modules-load.d/nvidia-vfio.conf"
|
||||||
|
|
||||||
|
# New per-BDF model: drop every NVIDIA BDF from the udev binder state so
|
||||||
|
# the nvidia module reclaims the GPU after the next reboot. Idempotent:
|
||||||
|
# no-op if no NVIDIA BDFs are tracked. Vendor 10de = NVIDIA.
|
||||||
|
if declare -F _proxmenux_vfio_bind_purge_vendor >/dev/null 2>&1; then
|
||||||
|
_proxmenux_vfio_bind_purge_vendor "10de" && changed=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Remove hard blacklist that was preventing nvidia module loading
|
# Remove hard blacklist that was preventing nvidia module loading
|
||||||
local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
|
local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
|
||||||
if [[ -f "$nvidia_blacklist" ]]; then
|
if [[ -f "$nvidia_blacklist" ]]; then
|
||||||
|
|||||||
@@ -324,6 +324,12 @@ _restore_nvidia_host_stack_for_lxc() {
|
|||||||
local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
|
local disabled_file="/etc/modules-load.d/nvidia-vfio.conf.proxmenux-disabled-vfio"
|
||||||
local active_file="/etc/modules-load.d/nvidia-vfio.conf"
|
local active_file="/etc/modules-load.d/nvidia-vfio.conf"
|
||||||
|
|
||||||
|
# New per-BDF model: drop every NVIDIA BDF from the udev binder state so
|
||||||
|
# the nvidia module reclaims the GPU after the next reboot. Idempotent.
|
||||||
|
if declare -F _proxmenux_vfio_bind_purge_vendor >/dev/null 2>&1; then
|
||||||
|
_proxmenux_vfio_bind_purge_vendor "10de" && changed=true
|
||||||
|
fi
|
||||||
|
|
||||||
# Remove hard blacklist that was preventing nvidia module loading
|
# Remove hard blacklist that was preventing nvidia module loading
|
||||||
local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
|
local nvidia_blacklist="/etc/modprobe.d/nvidia-blacklist.conf"
|
||||||
if [[ -f "$nvidia_blacklist" ]]; then
|
if [[ -f "$nvidia_blacklist" ]]; then
|
||||||
|
|||||||
Reference in New Issue
Block a user