GPU passthrough (NVIDIA H100) causes VFIO error
Host environment
- Operating system: Ubuntu 22.04, Red Hat Linux 9, CentOS 9
- OS/kernel version: 5.19.0-45
- Architecture: x86
- QEMU flavor: qemu-system-x86_64
- QEMU version: 6.2.0, 8.0.2
- QEMU command line:
2023-06-20 07:14:23.004+0000: starting up libvirt version: 8.0.0, package: 1ubuntu7.5 (Marc Deslauriers <marc.deslauriers@ubuntu.com> Fri, 26 May 2023 10:08:33 -0400), qemu version: 6.2.0Debian 1:6.2+dfsg-2ubuntu6.11, kernel: 5.19.0-45-generic, hostname: dev-gn-i12004
LC_ALL=C \
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin \
HOME=/var/lib/libvirt/qemu/domain-11-h100vm-uefi \
XDG_DATA_HOME=/var/lib/libvirt/qemu/domain-11-h100vm-uefi/.local/share \
XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain-11-h100vm-uefi/.cache \
XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain-11-h100vm-uefi/.config \
/usr/bin/qemu-system-x86_64 \
-name guest=h100vm-uefi,debug-threads=on \
-S \
-object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-11-h100vm-uefi/master-key.aes"}' \
-blockdev '{"driver":"file","filename":"/usr/share/OVMF/OVMF_GMO.fd","node-name":"libvirt-pflash0-storage","auto-read-only":true,"discard":"unmap"}' \
-blockdev '{"node-name":"libvirt-pflash0-format","read-only":true,"driver":"raw","file":"libvirt-pflash0-storage"}' \
-blockdev '{"driver":"file","filename":"/var/lib/libvirt/qemu/nvram/h100vm-uefi_VARS.fd","node-name":"libvirt-pflash1-storage","auto-read-only":true,"discard":"unmap"}' \
-blockdev '{"node-name":"libvirt-pflash1-format","read-only":false,"driver":"raw","file":"libvirt-pflash1-storage"}' \
-machine pc-q35-6.2,usb=off,vmport=off,dump-guest-core=off,pflash0=libvirt-pflash0-format,pflash1=libvirt-pflash1-format,memory-backend=pc.ram \
-accel kvm \
-cpu host,migratable=on \
-m 131072 \
-object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":137438953472}' \
-overcommit mem-lock=off \
-smp 22,sockets=22,cores=1,threads=1 \
-uuid dc417e9c-add7-4730-bd92-d583c5abe740 \
-no-user-config \
-nodefaults \
-chardev socket,id=charmonitor,fd=33,server=on,wait=off \
-mon chardev=charmonitor,id=monitor,mode=control \
-rtc base=utc,driftfix=slew \
-global kvm-pit.lost_tick_policy=delay \
-no-hpet \
-no-shutdown \
-global ICH9-LPC.disable_s3=1 \
-global ICH9-LPC.disable_s4=1 \
-boot strict=on \
-device pcie-root-port,port=16,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2 \
-device pcie-root-port,port=17,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 \
-device pcie-root-port,port=18,chassis=3,id=pci.3,bus=pcie.0,addr=0x2.0x2 \
-device pcie-root-port,port=19,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x3 \
-device pcie-root-port,port=20,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x4 \
-device pcie-root-port,port=21,chassis=6,id=pci.6,bus=pcie.0,addr=0x2.0x5 \
-device pcie-root-port,port=22,chassis=7,id=pci.7,bus=pcie.0,addr=0x2.0x6 \
-device pcie-root-port,port=23,chassis=8,id=pci.8,bus=pcie.0,addr=0x2.0x7 \
-device pcie-root-port,port=24,chassis=9,id=pci.9,bus=pcie.0,multifunction=on,addr=0x3 \
-device pcie-root-port,port=25,chassis=10,id=pci.10,bus=pcie.0,addr=0x3.0x1 \
-device pcie-root-port,port=26,chassis=11,id=pci.11,bus=pcie.0,addr=0x3.0x2 \
-device pcie-root-port,port=27,chassis=12,id=pci.12,bus=pcie.0,addr=0x3.0x3 \
-device pcie-root-port,port=28,chassis=13,id=pci.13,bus=pcie.0,addr=0x3.0x4 \
-device pcie-root-port,port=29,chassis=14,id=pci.14,bus=pcie.0,addr=0x3.0x5 \
-device qemu-xhci,p2=15,p3=15,id=usb,bus=pci.2,addr=0x0 \
-device virtio-scsi-pci,id=scsi0,bus=pci.7,addr=0x0 \
-device virtio-serial-pci,id=virtio-serial0,bus=pci.3,addr=0x0 \
-blockdev '{"driver":"file","filename":"/var/lib/nova/instances/h100vm-uefi/h100vm-uefi.qcow2","node-name":"libvirt-2-storage","auto-read-only":true,"discard":"unmap"}' \
-blockdev '{"node-name":"libvirt-2-format","read-only":false,"driver":"qcow2","file":"libvirt-2-storage","backing":null}' \
-device virtio-blk-pci,bus=pci.4,addr=0x0,drive=libvirt-2-format,id=virtio-disk0,bootindex=1 \
-device ide-cd,bus=ide.0,id=sata0-0-0 \
-netdev tap,fd=34,id=hostnet0,vhost=on,vhostfd=36 \
-device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:38:35:81,bus=pci.1,addr=0x0 \
-chardev pty,id=charserial0 \
-device isa-serial,chardev=charserial0,id=serial0 \
-chardev socket,id=charchannel0,fd=31,server=on,wait=off \
-device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=org.qemu.guest_agent.0 \
-chardev spicevmc,id=charchannel1,name=vdagent \
-device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,id=channel1,name=com.redhat.spice.0 \
-device usb-tablet,id=input0,bus=usb.0,port=1 \
-audiodev '{"id":"audio1","driver":"none"}' \
-vnc 127.0.0.1:0,audiodev=audio1 \
-device VGA,id=video0,vgamem_mb=16,bus=pcie.0,addr=0x1 \
-device ich9-intel-hda,id=sound0,bus=pcie.0,addr=0x1b \
-device hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0,audiodev=audio1 \
-chardev spicevmc,id=charredir0,name=usbredir \
-device usb-redir,chardev=charredir0,id=redir0,bus=usb.0,port=2 \
-chardev spicevmc,id=charredir1,name=usbredir \
-device usb-redir,chardev=charredir1,id=redir1,bus=usb.0,port=3 \
-device vfio-pci,host=0000:17:00.0,id=hostdev0,bus=pci.8,addr=0x0 \
-device virtio-balloon-pci,id=balloon0,bus=pci.5,addr=0x0 \
-object '{"qom-type":"rng-random","id":"objrng0","filename":"/dev/urandom"}' \
-device virtio-rng-pci,rng=objrng0,id=rng0,bus=pci.6,addr=0x0 \
-sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \
-msg timestamp=on
Emulated/Virtualized environment
- Operating system: Ubuntu 22.04
- OS/kernel version: 5.19
- Architecture: x86
Description of problem
GPU passthrough (NVIDIA H100) causes the following error:
qemu-system-x86_64: vfio_err_notifier_handler(0000:17:00.0) Unrecoverable error detected. Please collect any data possible and then kill the guest
This error happens on CentOS, Red Hat Linux, and Ubuntu, with each of the kernels I have tried (5.19.0, 6.0, 6.2). With an L4 or L40 GPU inserted in the same server, the error does not happen; it only happens with the H100 GPU. With ESXi installed on the same server, everything is normal and the GPU works fine.
Along with the VFIO error, there are some error entries in the iDRAC log on my Dell server:
A bus fatal error was detected on a component at slot 2. Tue Jun 20 2023 05:51:51
A fatal error was detected on a component at bus 23 device 0 function 0. Tue Jun 20 2023 05:51:51
A fatal error was detected on a component at bus 22 device 2 function 0. Tue Jun 20 2023 05:51:51
In addition, I have tried GPU passthrough on both AMD-based and Intel-based Dell servers. With an AMD CPU, the GPU does not work in the VM, but no VFIO error occurs. With an Intel CPU, the VFIO error occurs.
Steps to reproduce
- Set up GPU passthrough
- Start VM
- Do something in vm
Additional information
Edited by zhou jielei