Open AlexMKX opened 6 months ago
@AlexMKX Thank you for reporting this issue. Can you please post the qemu command line and if possible to share the crash dump file?
Thanks, Vadim.
@vrozenfe thanks for taking this into account. I have sent dump via email. The command line is :
/usr/bin/kvm -id 137 -name stor,debug-threads=on -no-shutdown -chardev socket,id=qmp,path=/var/run/qemu-server/137.qmp,server=on,wait=off -mon chardev=qmp,mode=control -chardev socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5 -mon chardev=qmp-event,mode=control -pidfile /var/run/qemu-server/137.pid -daemonize -smbios type=1,uuid=52534755-c968-48b9-b6f1-8012fce39718
-drive if=pflash,unit=0,format=raw,readonly=on,file=/usr/share/pve-edk2-firmware//OVMF_CODE_4M.secboot.fd -drive if=pflash,unit=1,id=drive-efidisk0,format=raw,file=/dev/zvol/infra/vm-137
-disk-0,size=540672
-smp 4,sockets=2,cores=2,maxcpus=4 -nodefaults -boot menu=on,strict=on,reboot-timeout=1000,splash=/usr/share/qemu-server/bootsplash.jpg
-vnc unix:/var/run/qemu-server/137.vnc,password=on -cpu host,hv_ipi,hv_relaxed,hv_reset,hv_runtime,hv_spinlocks=0x1fff,hv_stimer,hv_synic,hv_time,hv_vapic,hv_vpindex,+kvm_pv_eoi,+kvm_pv_unhalt -m 82192
-object memory-backend-ram,id=ram-node0,size=41096M -numa node,nodeid=0,cpus=0-1,memdev=ram-node0 -object memory-backend-ram,id=ram-node1,size=41096M
-numa node,nodeid=1,cpus=2-3,memdev=ram-node1 -object iothread,id=iothread-virtioscsi0
-object iothread,id=iothread-virtioscsi1
-object iothread,id=iothread-virtioscsi3 -readconfig /usr/share/qemu-server/pve-q35-4.0.cfg
-device vmgenid,guid=96f37124-bc3c-4142-a49d-7d280cacdc8b -device usb-tablet,id=tablet,bus=ehci.0,port=1
-chardev socket,id=tpmchar,path=/var/run/qemu-server/137.swtpm -tpmdev emulator,id=tpmdev,chardev=tpmchar
-device tpm-tis,tpmdev=tpmdev -device VGA,id=vga,bus=pcie.0,addr=0x1
-chardev socket,path=/var/run/qemu-server/137.qga,server=on,wait=off,id=qga0 -device virtio-serial,id=qga0,bus=pci.0,addr=0x8 -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0
-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3,free-page-reporting=on -iscsi initiator-name=iqn.1993-08.org.debian:01:56e748a7f695 -drive if=none,id=drive-ide0,media=cdrom,aio=io_uring
-device ide-cd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0,bootindex=102
-device virtio-scsi-pci,id=virtioscsi0,bus=pci.3,addr=0x1,iothread=iothread-virtioscsi0
-drive file=/dev/zvol/storage/vm-137-disk-0,if=none,id=drive-scsi0,discard=on,throttling.bps-write=10485760,throttling.bps-write-max=104857600,format=raw,cache=none,aio=io_uring,detect-zeroes=unmap
-device scsi-hd,bus=virtioscsi0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0
-device virtio-scsi-pci,id=virtioscsi1,bus=pci.3,addr=0x2,iothread=iothread-virtioscsi1
-drive file=/dev/zvol/infra/vm-137-disk-3,if=none,id=drive-scsi1,cache=writeback,discard=on,format=raw,aio=io_uring,detect-zeroes=unmap -device scsi-hd,bus=virtioscsi1.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi1,id=scsi1,rotation_rate=1,bootindex=100
-device virtio-scsi-pci,id=virtioscsi3,bus=pci.3,addr=0x4,iothread=iothread-virtioscsi3
-drive file=/dev/disk/by-id/wwn-0x50014ee2139969e3,if=none,id=drive-scsi3,format=raw,cache=none,aio=io_uring,detect-zeroes=on -device scsi-hd,bus=virtioscsi3.0,channel=0,scsi-id=0,lun=3,drive=drive-scsi3,id=scsi3
-netdev type=tap,id=net0,ifname=tap137i0,script=/var/lib/qemu-server/pve-bridge,downscript=/var/lib/qemu-server/pve-bridgedown
-device e1000,mac=32:1E:10:7E:AE:E4,netdev=net0,bus=pci.0,addr=0x12,id=net0,bootindex=101 -rtc driftfix=slew,base=localtime -machine hpet=off,type=pc-q35-8.1+pve0
-global kvm-pit.lost_tick_policy=discard
This is current command line. Kindly note that during that BSOD there was no write bandwidth capping for the scsi0
I'm running an ECC Registered Supermicro X8DTU machine with other VMs, so I believe this is not caused by host-memory errors.
Some updates on the issue. During the issue-related operations, the virtual disk resided on a ZFS encrypted dataset with checksums, without compression, and with dedup turned on. This dataset resided on a spinning disk.
I've removed that dataset and passed the disk directly to the VM, created a partition, and encrypted it with VeraCrypt. At this moment ~1.2 TB has been copied without any problems. The drive is passed through as Virtio SCSI Single.
Hello. Kindly find updates below. Copying 2TB to storage on ZFS without dedup went flawlessly. Copying the same amount of data to storage on ZFS with dedup=skein,verify ended up with Device Not Ready (no BSOD yet, though). There are "Reset to device, \Device\RaidPort1, was issued." entries in the event logs. Registry settings in effect: Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\disk:TimeOutValue = 60 Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\disk:IoTimeoutValue = 60 Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\vioscsi\Parameters:IoTimeoutValue = 90
I'm going to cap the throughput for the target device in QEMU, to spread out and lower the write load on ZFS.
A 20 MBps bandwidth cap for the volume with dedup ended up with CRITICAL_PROCESS_DIED for svchost after 800GB copied. It seems the problem is most likely caused by underlying storage latency issues.
With a 15MBps bandwidth cap I got a device-not-ready error.
Hello. I have reduced the RAM for the VM to 4096MB, and now it fails faster. It seems I'm done with differential tests and am waiting for input from you :) With 2048MB RAM I'm getting stable BSODs at ~100GB copied. The BSODs are varied — not only hash mismatch; they appear random.
Describe the bug There is a Windows Server 2022 virtual machine with a physically connected 6TB drive and a 6TB drive residing on a ZFS HDD partition. While copying data from the physical drive to the virtual drive, a BSOD happens anywhere between 100 and 200 GB transferred. It happens when the target drive is connected via Virtio SCSI, Virtio SCSI Single, or Virtio Block. When the target drive is connected via SATA, the data are copied perfectly. In addition, running Optimize-Volume -Retrim on that SCSI-connected disk requires about 80GB+ of RAM available, though trimming the same drive connected via SATA causes no problems.
To Reproduce Steps to reproduce the behaviour: Copy a relatively large amount of data from one disk to another.
Host:
VM:
Additional context
The usual minidump analysis:
What tried to solve: