Live migration with TLS fails (GnuTLS AUTO_REKEY)
Host environment
- Operating system: Ubuntu 22.04.3 LTS
- OS/kernel version: 6.2.0-26-generic
- Architecture: x86
- QEMU flavor: qemu-system-x86_64
- QEMU version: 6.2 and 7.2.5
- QEMU command line:
/usr/bin/qemu-system-x86_64 -name guest=instance-00008c3d,debug-threads=on -S -object {"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-instance-00008c3d/master-key.aes"} -machine pc-q35-6.2,usb=off,dump-guest-core=off,memory-backend=pc.ram -accel kvm -cpu host,migratable=on -m 16384 -object {"qom-type":"memory-backend-ram","id":"pc.ram","size":17179869184} -overcommit mem-lock=off -smp 4,sockets=4,dies=1,cores=1,threads=1 -object {"qom-type":"iothread","id":"iothread1"} -object {"qom-type":"iothread","id":"iothread2"} -uuid f3166bd6-33dc-4e3e-afa7-4100754a580a -smbios type=1,manufacturer=OpenStack Foundation,product=OpenStack Nova,version=25.2.0,serial=f3166bd6-33dc-4e3e-afa7-4100754a580a,uuid=f3166bd6-33dc-4e3e-afa7-4100754a580a,family=Virtual Machine -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=33,server=on,wait=off -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device {"driver":"pcie-root-port","port":16,"chassis":1,"id":"pci.1","bus":"pcie.0","multifunction":true,"addr":"0x2"} -device {"driver":"pcie-root-port","port":17,"chassis":2,"id":"pci.2","bus":"pcie.0","addr":"0x2.0x1"} .... 
-device {"driver":"piix3-usb-uhci","id":"usb","bus":"pci.30","addr":"0x1"} -object {"qom-type":"secret","id":"libvirt-1-storage-auth-secret0","data":"xxxxxxx=","keyid":"masterKey0","iv":"xxxxxx","format":"base64"} -blockdev {"driver":"rbd","pool":"cinder_oa1_ec","image":"volume-42bea012-7a72-4005-abe9-399807773af9","server":[{"host":"10.x.x.x","port":"6789"},{"host":"10.x.x.x","port":"6789"},{"host":"10.x.x.x","port":"6789"},{"host":"10.x.x.x","port":"6789"},{"host":"10.x.x.x","port":"6789"},{"host":"10.x.x.x","port":"6789"}],"user":"cinder_oa1_ec","auth-client-required":["cephx","none"],"key-secret":"libvirt-1-storage-auth-secret0","node-name":"libvirt-1-storage","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":false,"discard":"unmap","cache":{"direct":true,"no-flush":false},"driver":"raw","file":"libvirt-1-storage"} -device {"driver":"virtio-blk-pci","iothread":"iothread1","bus":"pci.3","addr":"0x0","drive":"libvirt-1-format","id":"virtio-disk0","bootindex":1,"write-cache":"on","serial":"42bea012-7a72-4005-abe9-399807773af9"} -netdev {"type":"tap","fds":"35:37:38:39","vhost":true,"vhostfds":"40:41:42:43","id":"hostnet0"} -device {"driver":"virtio-net-pci","mq":true,"vectors":10,"host_mtu":1500,"netdev":"hostnet0","id":"net0","mac":"fa:16:3e:de:10:48","bus":"pci.2","addr":"0x0"} -add-fd set=0,fd=34,opaque=serial0-log -chardev pty,id=charserial0,logfile=/dev/fdset/0,logappend=on -device {"driver":"isa-serial","chardev":"charserial0","id":"serial0","index":0} -device {"driver":"usb-tablet","id":"input0","bus":"usb.0","port":"1"} -audiodev {"id":"audio1","driver":"none"} -vnc 10.x.x.x:0,audiodev=audio1 -device {"driver":"virtio-vga","id":"video0","max_outputs":1,"bus":"pcie.0","addr":"0x1"} -device {"driver":"virtio-balloon-pci","id":"balloon0","bus":"pci.4","addr":"0x0"} -object {"qom-type":"rng-random","id":"objrng0","filename":"/dev/urandom"} -device 
{"driver":"virtio-rng-pci","rng":"objrng0","id":"rng0","bus":"pci.5","addr":"0x0"} -device {"driver":"vmcoreinfo"} -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
Emulated/Virtualized environment
- Operating system: Ubuntu 22.04
- OS/kernel version: 6.2
- Architecture: x86
Description of problem
Live migration with TLS fails in the postcopy stage when:
migrate-start-postcopy
is executed approximately 80 seconds or more after migration start.
A) Last events from src point of view: (libvirt log)
2023-10-12 15:10:22.087+0000: 587026: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f18740bc5d0 msg={"execute":"migrate-start-postcopy","id":"libvirt-939"}
2023-10-12 15:10:22.088+0000: 588063: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f18740bc5d0 reply={"return": {}, "id": "libvirt-939"}
2023-10-12 15:10:22.166+0000: 588063: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f18740bc5d0 event={"timestamp": {"seconds": 1697123422, "microseconds": 166533}, "event": "STOP"}
2023-10-12 15:10:22.167+0000: 588063: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f18740bc5d0 event={"timestamp": {"seconds": 1697123422, "microseconds": 167893}, "event": "MIGRATION", "data": {"status": "pre-switchover"}}
2023-10-12 15:10:22.168+0000: 587024: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f18740bc5d0 msg={"execute":"query-migrate","id":"libvirt-940"}
2023-10-12 15:10:22.168+0000: 588063: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f18740bc5d0 reply={"return": {"expected-downtime": 23658, "status": "pre-switchover", "setup-time": 38, "total-time": 256043, "ram": {"total": 17189380096, "postcopy-requests": 0, "dirty-sync-count": 10, "multifd-bytes": 0, "pages-per-second": 133039, "page-size": 4096, "remaining": 5598875648, "mbps": 4367.9463529411769, "transferred": 124973336291, "duplicate": 492617, "dirty-pages-rate": 120585, "skipped": 0, "normal-bytes": 124725260288, "normal": 30450503}}, "id": "libvirt-940"}
2023-10-12 15:10:22.168+0000: 587024: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f18740bc5d0 msg={"execute":"migrate-continue","arguments":{"state":"pre-switchover"},"id":"libvirt-941"}
2023-10-12 15:10:22.169+0000: 588063: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f18740bc5d0 reply={"return": {}, "id": "libvirt-941"}
2023-10-12 15:10:22.169+0000: 588063: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f18740bc5d0 event={"timestamp": {"seconds": 1697123422, "microseconds": 168996}, "event": "MIGRATION", "data": {"status": "postcopy-active"}}
2023-10-12 15:10:22.169+0000: 587024: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f18740bc5d0 msg={"execute":"query-migrate","id":"libvirt-942"}
2023-10-12 15:10:22.169+0000: 588063: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f18740bc5d0 reply={"return": {"expected-downtime": 23658, "status": "postcopy-active", "setup-time": 38, "total-time": 256044, "ram": {"total": 17189380096, "postcopy-requests": 0, "dirty-sync-count": 10, "multifd-bytes": 0, "pages-per-second": 133039, "page-size": 4096, "remaining": 5598875648, "mbps": 4367.9463529411769, "transferred": 124973336291, "duplicate": 492617, "dirty-pages-rate": 120585, "skipped": 0, "normal-bytes": 124725260288, "normal": 30450503}}, "id": "libvirt-942"}
2023-10-12 15:10:22.171+0000: 588063: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f18740bc5d0 event={"timestamp": {"seconds": 1697123422, "microseconds": 171487}, "event": "MIGRATION_PASS", "data": {"pass": 11}}
2023-10-12 15:10:22.738+0000: 587024: error : virNetClientProgramDispatchError:172 : internal error: qemu unexpectedly closed the monitor: 2023-10-12T15:10:22.288516Z qemu-system-x86_64: Cannot read from TLS channel: Input/output error
2023-10-12T15:10:22.288542Z qemu-system-x86_64: error while loading state section id 3(ram)
2023-10-12T15:10:22.288549Z qemu-system-x86_64: postcopy_ram_listen_thread: loadvm failed: -5
2023-10-12 15:10:24.671+0000: 587024: warning : qemuMigrationSrcPostcopyFailed:1676 : Migration of domain instance-00008c3d failed during post-copy; leaving the domain paused
migrate-start-postcopy
is executed less than 70 seconds after migration start.
B) In this case the postcopy stage successfully starts, but after a few seconds it stops with a postcopy-paused event. In this case it is possible to recover postcopy, and the migration then completes successfully. The time range in which this stop occurs is about the same as in the first case, 70-80 seconds from the start. ("total-time": 76353)
Log:
2023-10-12 15:36:19.255+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {"expected-downtime": 29870, "status": "postcopy-active", "setup-time": 7, "total-time": 76353, "ram": {"total": 17189380096, "postcopy-requests": 1123, "dirty-sync-count": 5, "multifd-bytes": 0, "pages-per-second": 162710, "page-size": 4096, "remaining": 6681395200, "mbps": 3063.6026915887851, "transferred": 34369665783, "duplicate": 3532623, "dirty-pages-rate": 182606, "skipped": 0, "normal-bytes": 34270924800, "normal": 8366925}}, "id": "libvirt-574"}
2023-10-12 15:36:19.454+0000: 590673: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f2e5c0bd2f0 event={"timestamp": {"seconds": 1697124979, "microseconds": 454116}, "event": "MIGRATION", "data": {"status": "postcopy-paused"}}
2023-10-12 15:36:19.454+0000: 590673: warning : qemuMigrationSrcPostcopyFailed:1676 : Migration of domain instance-00008c3d failed during post-copy; leaving the domain paused
2023-10-12 15:36:19.465+0000: 590559: error : virNetClientProgramDispatchError:172 : operation failed: job 'migration in' failed in post-copy phase
#and resume
2023-10-12 15:36:54.965+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {"expected-downtime": 27216, "status": "postcopy-paused", "setup-time": 7, "total-time": 112062, "ram": {"total": 17189380096, "postcopy-requests": 1166, "dirty-sync-count": 5, "multifd-bytes": 0, "pages-per-second": 174074, "page-size": 4096, "remaining": 6539677696, "mbps": 3362.3226168224301, "transferred": 34450063130, "duplicate": 3547666, "dirty-pages-rate": 182606, "skipped": 0, "normal-bytes": 34351030272, "normal": 8386482}}, "id": "libvirt-575"}
2023-10-12 15:36:54.984+0000: 590562: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f2e5c0bd2f0 msg={"execute":"getfd","arguments":{"fdname":"migrate"},"id":"libvirt-576"}
2023-10-12 15:36:54.985+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {}, "id": "libvirt-576"}
2023-10-12 15:36:54.985+0000: 590562: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f2e5c0bd2f0 msg={"execute":"migrate","arguments":{"detach":true,"resume":true,"uri":"fd:migrate"},"id":"libvirt-577"}
2023-10-12 15:36:54.988+0000: 590673: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f2e5c0bd2f0 event={"timestamp": {"seconds": 1697125014, "microseconds": 988571}, "event": "MIGRATION", "data": {"status": "postcopy-recover"}}
2023-10-12 15:36:54.989+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {}, "id": "libvirt-577"}
2023-10-12 15:36:54.992+0000: 590673: info : qemuMonitorJSONIOProcessLine:203 : QEMU_MONITOR_RECV_EVENT: mon=0x7f2e5c0bd2f0 event={"timestamp": {"seconds": 1697125014, "microseconds": 992171}, "event": "MIGRATION", "data": {"status": "postcopy-active"}}
2023-10-12 15:36:55.452+0000: 590559: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f2e5c0bd2f0 msg={"execute":"query-migrate","id":"libvirt-578"}
2023-10-12 15:36:55.453+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {"expected-downtime": 29857, "status": "postcopy-active", "setup-time": 7, "total-time": 112550, "ram": {"total": 17189380096, "postcopy-requests": 1649, "dirty-sync-count": 5, "multifd-bytes": 0, "pages-per-second": 165557, "page-size": 4096, "remaining": 6236778496, "mbps": 3064.9029230769233, "transferred": 34626885444, "duplicate": 3581565, "dirty-pages-rate": 182606, "skipped": 0, "normal-bytes": 34527203328, "normal": 8429493}}, "id": "libvirt-578"}
2023-10-12 15:36:55.954+0000: 590563: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f2e5c0bd2f0 msg={"execute":"query-migrate","id":"libvirt-579"}
2023-10-12 15:36:55.954+0000: 590673: info : qemuMonitorJSONIOProcessLine:208 : QEMU_MONITOR_RECV_REPLY: mon=0x7f2e5c0bd2f0 reply={"return": {"expected-downtime": 29714, "status": "postcopy-active", "setup-time": 7, "total-time": 113051, "ram": {"total": 17189380096, "postcopy-requests": 2174, "dirty-sync-count": 5, "multifd-bytes": 0, "pages-per-second": 162815, "page-size": 4096, "remaining": 5893496832, "mbps": 3079.7154174757284, "transferred": 34819658450, "duplicate": 3618483, "dirty-pages-rate": 182606, "skipped": 0, "normal-bytes": 34719268864, "normal": 8476384}}, "id": "libvirt-579"}
2023-10-12 15:36:56.455+0000: 590561: info : qemuMonitorSend:861 : QEMU_MONITOR_SEND_MSG: mon=0x7f2e5c0bd2f0 msg={"execute":"query-migrate","id":"libvirt-580"}
Steps to reproduce
- run VM with heavy RAM load:
nohup stress-ng --vm 6 --vm-bytes 12G &
- run precopy for more than 80 seconds
- switch into post-copy stage
Additional information
This only occurs with the TLS transport; if plain (unencrypted) qemu+tcp is used, then everything works.