QEMU monitor hangs after "stop" QMP command called in postcopy-paused migration state
Host environment
- Operating system: Fedora 35
- OS/kernel version: Linux f35-2.virt 5.15.5-200.fc35.x86_64 #1 SMP Fri Nov 26 00:46:42 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
- Architecture: x86_64
- QEMU flavor: qemu-system-x86_64
- QEMU version: 7.0.0
- QEMU command line:
2022-06-01 11:28:09.921+0000: starting up libvirt version: 8.5.0, qemu version: 7.0.0, kernel: 5.15.5-200.fc35.x86_64, hostname: f35-2.virt LC_ALL=C \ PATH=/root/.local/bin:/root/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin \ HOME=/var/lib/libvirt/qemu/domain-1-nest \ USER=root \ LOGNAME=root \ XDG_DATA_HOME=/var/lib/libvirt/qemu/domain-1-nest/.local/share \ XDG_CACHE_HOME=/var/lib/libvirt/qemu/domain-1-nest/.cache \ XDG_CONFIG_HOME=/var/lib/libvirt/qemu/domain-1-nest/.config \ /home/qemu-7.0.0/build/qemu-system-x86_64 \ -name guest=nest,debug-threads=on \ -S \ -object '{"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-nest/master-key.aes"}' \ -machine pc-i440fx-6.1,usb=off,vmport=off,dump-guest-core=off,memory-backend=pc.ram,graphics=off \ -accel kvm \ -cpu host,migratable=on \ -m 512 \ -object '{"qom-type":"memory-backend-ram","id":"pc.ram","size":536870912}' \ -overcommit mem-lock=off \ -smp 2,sockets=2,cores=1,threads=1 \ -uuid 9d05d8a7-ee74-4a2d-b4c4-4ec513273d92 \ -no-user-config \ -nodefaults \ -chardev socket,id=charmonitor,fd=36,server=on,wait=off \ -mon chardev=charmonitor,id=monitor,mode=control \ -rtc base=utc,driftfix=slew \ -global kvm-pit.lost_tick_policy=delay \ -no-hpet \ -no-shutdown \ -global PIIX4_PM.disable_s3=1 \ -global PIIX4_PM.disable_s4=1 \ -boot strict=on \ -device '{"driver":"ich9-usb-ehci1","id":"usb","bus":"pci.0","addr":"0x5.0x7"}' \ -device '{"driver":"ich9-usb-uhci1","masterbus":"usb.0","firstport":0,"bus":"pci.0","multifunction":true,"addr":"0x5"}' \ -device '{"driver":"ich9-usb-uhci2","masterbus":"usb.0","firstport":2,"bus":"pci.0","addr":"0x5.0x1"}' \ -device '{"driver":"ich9-usb-uhci3","masterbus":"usb.0","firstport":4,"bus":"pci.0","addr":"0x5.0x2"}' \ -device '{"driver":"virtio-serial-pci","id":"virtio-serial0","bus":"pci.0","addr":"0x4"}' \ -blockdev '{"driver":"file","filename":"/vm/nest.qcow","aio":"native","node-name":"libvirt-1-storage","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \ -blockdev '{"node-name":"libvirt-1-format","read-only":false,"cache":{"direct":true,"no-flush":false},"driver":"qcow2","file":"libvirt-1-storage","backing":null}' \ -device '{"driver":"virtio-blk-pci","bus":"pci.0","addr":"0x6","drive":"libvirt-1-format","id":"virtio-disk0","bootindex":1,"write-cache":"on"}' \ -netdev tap,fd=37,vhost=on,vhostfd=39,id=hostnet0 \ -device '{"driver":"virtio-net-pci","netdev":"hostnet0","id":"net0","mac":"52:54:00:59:59:91","bus":"pci.0","addr":"0x3"}' \ -chardev pty,id=charserial0 \ -device '{"driver":"isa-serial","chardev":"charserial0","id":"serial0","index":0}' \ -chardev socket,id=charchannel0,fd=35,server=on,wait=off \ -device '{"driver":"virtserialport","bus":"virtio-serial0.0","nr":1,"chardev":"charchannel0","id":"channel0","name":"org.qemu.guest_agent.0"}' \ -device '{"driver":"usb-tablet","id":"input0","bus":"usb.0","port":"1"}' \ -audiodev '{"id":"audio1","driver":"none"}' \ -vnc 0.0.0.0:0,audiodev=audio1 \ -device '{"driver":"VGA","id":"video0","vgamem_mb":16,"bus":"pci.0","addr":"0x2"}' \ -incoming defer \ -msg timestamp=on
Emulated/Virtualized environment
- Operating system: RHEL 7.8
- OS/kernel version: Linux nest.virt 3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 11 19:12:04 EDT 2020 x86_64 x86_64 x86_64 GNU/Linux
- Architecture: x86
Description of problem
QEMU monitor hangs when I try to pause virtual CPUs using "stop" QMP command on the destination host once migration enters postcopy-paused (after it was paused using "migrate-pause" QMP command on the source host). QEMU just does not send any reply to the "stop" command.
Steps to reproduce
- start migration
- wait for the first iteration to finish
- switch to post-copy using "migrate-start-postcopy"
- break migration with "migrate-pause"
- send "stop" to the destination monitor
Additional information
Unfortunately I haven't been able to get a stack trace as gdb just hangs when I try to attach it to QEMU after step 4. I can see threads getting SIGUSR1 after the "stop" command, but I cannot get to gdb prompt afterwards:
(gdb) c
Continuing.
[New Thread 0x7f41ec9be640 (LWP 1112)]
[New Thread 0x7f41d7fff640 (LWP 1113)]
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 5 "CPU 1/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
Thread 4 "CPU 0/KVM" received signal SIGUSR1, User defined signal 1.
I was able to attach strace to it though (in case it is at least a bit useful). The first line corresponds to the final '}' of the {"execute":"stop","id":"libvirt-413"} QMP comamnd:
[pid 72970] recvmsg(20, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="}", iov_len=1}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 1
[pid 72970] write(4, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 72949] <... ppoll resumed>) = 1 ([{fd=4, revents=POLLIN}], left {tv_sec=0, tv_nsec=513181335})
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] read(4, <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... read resumed>"\1\0\0\0\0\0\0\0", 512) = 8
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] ppoll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=12, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=38, events=POLLIN}, {fd=40, events=POLLIN}, {fd=41, events=POLLIN}, {fd=42, events=POLLIN}, {fd=43, events=POLLIN}, {fd=44, events=POLLIN}, {fd=45, events=POLLIN}, {fd=46, events=POLLIN}, {fd=47, events=POLLIN}, {fd=48, events=POLLIN}, {fd=49, events=POLLIN}, {fd=50, events=POLLIN}, {fd=51, events=POLLIN}, {fd=52, events=POLLIN}, {fd=53, events=POLLIN}, {fd=54, events=POLLIN}, {fd=55, events=POLLIN}, {fd=56, events=POLLIN}, ...], 74, {tv_sec=0, tv_nsec=0}, NULL, 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... ppoll resumed>) = 0 (Timeout)
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] write(8, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... write resumed>) = 8
[pid 72970] write(19, "\1\0\0\0\0\0\0\0", 8 <unfinished ...>
[pid 72949] ppoll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=12, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=38, events=POLLIN}, {fd=40, events=POLLIN}, {fd=41, events=POLLIN}, {fd=42, events=POLLIN}, {fd=43, events=POLLIN}, {fd=44, events=POLLIN}, {fd=45, events=POLLIN}, {fd=46, events=POLLIN}, {fd=47, events=POLLIN}, {fd=48, events=POLLIN}, {fd=49, events=POLLIN}, {fd=50, events=POLLIN}, {fd=51, events=POLLIN}, {fd=52, events=POLLIN}, {fd=53, events=POLLIN}, {fd=54, events=POLLIN}, {fd=55, events=POLLIN}, {fd=56, events=POLLIN}, ...], 74, {tv_sec=0, tv_nsec=0}, NULL, 8 <unfinished ...>
[pid 72970] <... write resumed>) = 8
[pid 72949] <... ppoll resumed>) = 1 ([{fd=8, revents=POLLIN}], left {tv_sec=0, tv_nsec=0})
[pid 72970] poll([{fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=0}], 3, -1 <unfinished ...>
[pid 72949] rt_sigprocmask(SIG_BLOCK, ~[], <unfinished ...>
[pid 72970] <... poll resumed>) = 1 ([{fd=19, revents=POLLIN}])
[pid 72949] <... rt_sigprocmask resumed>[BUS USR1 ALRM IO], 8) = 0
[pid 72970] read(19, <unfinished ...>
[pid 72949] getpid() = 72949
[pid 72970] <... read resumed>"\5\0\0\0\0\0\0\0", 16) = 8
[pid 72949] tgkill(72949, 72971, SIGUSR1 <unfinished ...>
[pid 72970] poll([{fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=0}], 3, -1 <unfinished ...>
[pid 72949] <... tgkill resumed>) = 0
[pid 72949] rt_sigprocmask(SIG_SETMASK, [BUS USR1 ALRM IO], NULL, 8) = 0
[pid 72949] rt_sigprocmask(SIG_BLOCK, ~[], [BUS USR1 ALRM IO], 8) = 0
[pid 72949] getpid() = 72949
[pid 72949] tgkill(72949, 72972, SIGUSR1) = 0
[pid 72949] rt_sigprocmask(SIG_SETMASK, [BUS USR1 ALRM IO], NULL, 8) = 0
[pid 72949] futex(0x5606f6cb73a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, NULL, FUTEX_BITSET_MATCH_ANY
And that's it, the last futex never returns.
Edited by Jiri Denemark