virtio-win / kvm-guest-drivers-windows

Windows paravirtualized drivers for QEMU\KVM
https://www.linux-kvm.org/page/WindowsGuestDrivers
BSD 3-Clause "New" or "Revised" License
2.07k stars 386 forks source link

Windows Server 2019 Validation #721

Open ye-yng opened 2 years ago

ye-yng commented 2 years ago

Hi, I'm currently validating the performance of a Windows Server 2019 VM which is managed by LXD (which uses QEMU).

I have been running performance tests and comparing them with an Ubuntu 20.04 VM. The original post on LXD's discuss page can be found here.

The findings indicated that there is a significant performance decrease when it comes to IO operations when comparing the performance values of Linux vs Windows VMs. I wanted to ask for input/validation from members of the community to see if these values are within the norm.

Following is the graph showing the performance ratio (Windows / Linux VM Performance):

image

viktor-prutyanov commented 2 years ago

Hi, could you please share your Windows VM configuration (libvirt XML or QEMU command-line) ?

ye-yng commented 2 years ago

Hi, following is the LXD generated qemu.conf file:

# Machine
[machine]
graphics = "off"
type = "q35"
accel = "kvm"
usb = "off"

[global]
driver = "ICH9-LPC"
property = "disable_s3"
value = "1"

[global]
driver = "ICH9-LPC"
property = "disable_s4"
value = "1"
[boot-opts]
strict = "on"

# Console
[chardev "console"]
backend = "pty"

# Memory
[memory]
size = "15258M"

# CPU
[smp-opts]
cpus = "12"
sockets = "1"
cores = "12"
threads = "1"

[object "mem0"]

qom-type = "memory-backend-memfd"
size = "15258M"
share = "on"

[numa]
type = "node"
nodeid = "0"
memdev = "mem0"

# Firmware (read only)
[drive]
file = "/snap/lxd/current/share/qemu/OVMF_CODE.fd"
if = "pflash"
format = "raw"
unit = "0"
readonly = "on"

# Firmware settings (writable)
[drive]
file = "/var/snap/lxd/common/lxd/virtual-machines/windows-test/qemu.nvram"
if = "pflash"
format = "raw"
unit = "1"

# Qemu control
[chardev "monitor"]
backend = "socket"
path = "/var/snap/lxd/common/lxd/logs/windows-test/qemu.monitor"
server = "on"
wait = "off"

[mon]
chardev = "monitor"
mode = "control"

[device "qemu_pcie0"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.0"
chassis = "0"
multifunction = "on"

# Balloon driver
[device "qemu_balloon"]
driver = "virtio-balloon-pci"
bus = "qemu_pcie0"
addr = "00.0"

multifunction = "on"

# Random number generator
[object "qemu_rng"]
qom-type = "rng-random"
filename = "/dev/urandom"

[device "dev-qemu_rng"]
driver = "virtio-rng-pci"
bus = "qemu_pcie0"
addr = "00.1"

rng = "qemu_rng"

# Input
[device "qemu_keyboard"]
driver = "virtio-keyboard-pci"
bus = "qemu_pcie0"
addr = "00.2"

# Input
[device "qemu_tablet"]
driver = "virtio-tablet-pci"
bus = "qemu_pcie0"
addr = "00.3"

# Vsock
[device "qemu_vsock"]
driver = "vhost-vsock-pci"
bus = "qemu_pcie0"
addr = "00.4"

guest-cid = "14"

# Virtual serial bus
[device "dev-qemu_serial"]
driver = "virtio-serial-pci"
bus = "qemu_pcie0"
addr = "00.5"

# LXD serial identifier
[chardev "qemu_serial-chardev"]
backend = "ringbuf"
size = "16B"

[device "qemu_serial"]
driver = "virtserialport"
name = "org.linuxcontainers.lxd"
chardev = "qemu_serial-chardev"
bus = "dev-qemu_serial.0"

# Spice agent
[chardev "qemu_spice-chardev"]
backend = "spicevmc"
name = "vdagent"

[device "qemu_spice"]
driver = "virtserialport"
name = "com.redhat.spice.0"
chardev = "qemu_spice-chardev"
bus = "dev-qemu_serial.0"

# Spice folder
[chardev "qemu_spicedir-chardev"]
backend = "spiceport"
name = "org.spice-space.webdav.0"

[device "qemu_spicedir"]
driver = "virtserialport"
name = "org.spice-space.webdav.0"
chardev = "qemu_spicedir-chardev"
bus = "dev-qemu_serial.0"

# USB controller
[device "qemu_usb"]
driver = "qemu-xhci"
bus = "qemu_pcie0"
addr = "00.6"

[chardev "qemu_spice-usb-chardev1"]
  backend = "spicevmc"
  name = "usbredir"

[chardev "qemu_spice-usb-chardev2"]
  backend = "spicevmc"
  name = "usbredir"

[chardev "qemu_spice-usb-chardev3"]
  backend = "spicevmc"
  name = "usbredir"

[device "qemu_spice-usb1"]
  driver = "usb-redir"
  chardev = "qemu_spice-usb-chardev1"

[device "qemu_spice-usb2"]
  driver = "usb-redir"
  chardev = "qemu_spice-usb-chardev2"

[device "qemu_spice-usb3"]
  driver = "usb-redir"
  chardev = "qemu_spice-usb-chardev3"

[device "qemu_pcie1"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.1"
chassis = "1"

# SCSI controller
[device "qemu_scsi"]
driver = "virtio-scsi-pci"
bus = "qemu_pcie1"
addr = "00.0"

[device "qemu_pcie2"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.2"
chassis = "2"

# Config drive (9p)
[fsdev "qemu_config"]
fsdriver = "local"
security_model = "none"
readonly = "on"
path = "/var/snap/lxd/common/lxd/virtual-machines/windows-test/config.mount"

[device "dev-qemu_config-drive-9p"]
driver = "virtio-9p-pci"
bus = "qemu_pcie2"
addr = "00.0"
mount_tag = "config"
fsdev = "qemu_config"
multifunction = "on"

# Config drive (virtio-fs)
[chardev "qemu_config"]
backend = "socket"
path = "/var/snap/lxd/common/lxd/logs/windows-test/virtio-fs.config.sock"

[device "dev-qemu_config-drive-virtio-fs"]
driver = "vhost-user-fs-pci"
bus = "qemu_pcie2"
addr = "00.1"
chardev = "qemu_config"
tag = "config"

[device "qemu_pcie3"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.3"
chassis = "3"

# GPU
[device "qemu_gpu"]
driver = "virtio-vga"
bus = "qemu_pcie3"
addr = "00.0"

[device "qemu_pcie4"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.4"
chassis = "4"

# config drive
[drive "lxd_config"]
file = "/var/snap/lxd/common/lxd/virtual-machines/windows-test/config.iso"
format = "raw"
if = "none"
cache = "none"
aio = "native"
discard = "on"
media = "cdrom"
file.locking = "off"
readonly = "off"

[device "dev-lxd_config"]
driver = "scsi-cd"
bus = "qemu_scsi.0"
channel = "0"
scsi-id = "2"
lun = "1"
drive = "lxd_config"
bootindex = "2"

# root drive
[drive "lxd_root"]
file = "/dev/ubuntu-vg/virtual-machines_windows--test.block"
format = "raw"
if = "none"
cache = "none"
aio = "native"
discard = "on"
media = "disk"
file.locking = "off"
readonly = "off"

[device "dev-lxd_root"]
driver = "scsi-hd"
bus = "qemu_scsi.0"
channel = "0"
scsi-id = "0"
lun = "1"
drive = "lxd_root"
bootindex = "0"

[device "qemu_pcie5"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.5"
chassis = "5"

[device "qemu_pcie6"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.6"
chassis = "6"

[device "qemu_pcie7"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "1.7"
chassis = "7"

[device "qemu_pcie8"]
driver = "pcie-root-port"
bus = "pcie.0"
addr = "2.0"
chassis = "8"
multifunction = "on"
ye-yng commented 2 years ago

For some extra information, I downloaded WSL on the Windows Server VM to compare some simple dd performance values.

The following commands were executed to test read and write performance:

# Read operation tests
dd if=/root/input of=/dev/null bs="64k" count="400k" iflag=direct

# Write operation tests
dd if=/root/input of=/root/test bs="64k" count="400k"

Following are the average values after 6 runs of each test:

image

image

It seems that for simple file copying operations the performance overhead on the Windows VM is lower when compared to DB operations.

vrozenfe commented 2 years ago

@ye-yng Do you have hyper-v optimizations enabled? I'm mostly interested in hv_time flag status. Can you please run the following utility https://bugzilla.redhat.com/attachment.cgi?id=1824469 inside of your VM and post the results?

Thanks, Vadim.

ye-yng commented 2 years ago

@vrozenfe It seems I need prior authorization to access the utility:

image

vrozenfe commented 2 years ago

@ye-yng Please check it now? If it still doesn't work then ping me on vrozenfe_at_redhat_dot_com Vadim.

ye-yng commented 2 years ago

image

This seems to be the only output from the utility.

vrozenfe commented 2 years ago

@ye-yng

That's enough for now. Thank you.

It looks as if you are using HPET (100MHz) but not hv_time (10 MHz). Most likely this is the reason for the Windows VM under-performing. I have no experience with LXD, but I guess that when it comes to running Windows VMs it is still the same old kvm+qemu. You need to find out how to activate hyper-v enlightenments supported by kvm and qemu. "hv_time" is one of the key features that need to be enabled before running any sort of benchmark tests.

Best, Vadim.

ye-yng commented 2 years ago

Thanks for the information, I'll look into it and report back with any new findings.

ye-yng commented 2 years ago

After finding out how to pass the QEMU configuration options via LXD, I executed the less performant tests again with the following hv_* parameters: hv_time and hv_passthrough

The executable output looks like the following with hv_time and hv_passthrough:

image

Following are the benchmark results:

image

While the pgbench tests had a significant performance increase, the sqlite test seems to underperform by a significant amount in comparison to Linux based QEMU VMs.

Is this the expected result?

vrozenfe commented 2 years ago

@ye-yng virtio-blk and virtio-scsi can be a bit slower on Windows compared to Linux, but probably not by as much. I've seen some "fio" results on cnv recently. Windows was almost on par with Linux in terms of storage performance.

ye-yng commented 2 years ago

@vrozenfe I installed fio on both the Linux and Windows VM and ran some performance tests. These tests were indeed quite close when it comes to storage performance.

It seems that the database benchmarks remain slower in comparison to file IO benchmarks, following are the final results side by side (with and without hv_passthrough flag):

image

The database benchmarks (SQLite and pgbench) had a significant increase in performance with the hv_passthrough flag while other tests had a marginal variation.

I will be installing a new server in the near future and will rerun these tests on it and report back any findings.

peixiu commented 2 years ago

Hi @ye-yng,

I'm trying to reproduce this issue without LXD. Could you tell me which host kernel and qemu-kvm versions you are using? I see you used OVMF, so could you provide the ovmf/edk2 version as well? And if possible, could you provide the qemu command line? It can be found with the command 'ps aux | grep qemu' on the host where the instance is running~

Thanks a lot~ Peixiu

peixiu commented 2 years ago

@ye-yng @vrozenfe

Hi, I tried to reproduce this issue on a win2019 VM and a rhel9.0 VM, tested with a SQLite test; the performance on win2019 is indeed slower than on the rhel9.0 VM. But I'm not sure if my configuration is the same as the customer's (@ye-yng). My test details are as follows:

1. For win2019, the sqlite results are as follows (link): https://openbenchmarking.org/result/2203285-NE-PEIXIUSQL33

Screenshot from 2022-03-31 05-51-15

Screenshot from 2022-03-31 05-33-27

2. For rhel9.0, the sqlite results as follows: Screenshot from 2022-03-31 05-48-53

Screenshot from 2022-03-31 05-53-11

We can see there is a large difference between the seconds spent on Windows and on Linux for the same test. Let's diff the details of the two results; sorry, I have not found a diff function in the PTS tool, so I diffed them manually:

Win2019 performance/RHEL9.0 performance: Screenshot from 2022-03-31 06-19-50

Used qemu commands: /usr/libexec/qemu-kvm \ -name 'avocado-vt-vm3' \ -machine q35 \ -nodefaults \ -vga std \ -device pcie-root-port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x3 \ -device pcie-root-port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x3.0x1 \ -device pcie-root-port,port=0x12,chassis=3,id=pci.3,bus=pcie.0,addr=0x3.0x2 \ -device pcie-root-port,port=0x13,chassis=4,id=pci.4,bus=pcie.0,addr=0x3.0x3 \ -device pcie-root-port,port=0x14,chassis=5,id=pci.5,bus=pcie.0,addr=0x3.0x4 \ -device pcie-root-port,port=0x15,chassis=6,id=pci.6,bus=pcie.0,addr=0x3.0x5 \ -device pcie-root-port,port=0x16,chassis=7,id=pci.7,bus=pcie.0,addr=0x3.0x6 \ -device pcie-root-port,port=0x17,chassis=8,id=pci.8,bus=pcie.0,addr=0x3.0x7 \ -device virtio-scsi-pci,id=scsi0,bus=pci.3,addr=0x0 \ -blockdev driver=file,filename=/home/kvm_autotest_root/images/win2019-64-virtio-scsi.qcow2,node-name=libvirt-3-storage,cache.direct=on,cache.no-flush=off,auto-read-only=on,discard=unmap \ -blockdev node-name=libvirt-3-format,read-only=off,discard=unmap,detect-zeroes=unmap,cache.direct=on,cache.no-flush=off,driver=qcow2,file=libvirt-3-storage \ -device scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,device_id=drive-scsi0-0-0-0,drive=libvirt-3-format,id=scsi0-0-0-0,bootindex=0,write-cache=on \ -device virtio-net-pci,mac=9a:36:83:b6:3d:05,id=idJVpmsF,netdev=id23ZUK6,bus=pci.3 \ -netdev tap,id=id23ZUK6,vhost=on \ -m 16384 \ -smp 12,maxcpus=12 \ -cpu host,hv_stimer,hv_synic,hv_vpindex,hv_reset,hv_relaxed,hv_spinlocks=0x1fff,hv_vapic,hv_time,hv-tlbflush,+kvm_pv_unhalt,hv_passthrough \ -cdrom /home/WindowsServer2012Essentials-ADK.iso \ -device piix3-usb-uhci,id=usb -device usb-tablet,id=input0 \ -vnc :6 \ -rtc base=localtime,clock=host,driftfix=slew \ -boot order=cdn,once=c,menu=off,strict=off \ -enable-kvm \ -qmp tcp:0:4445,server,nowait \ -monitor stdio \

Run SQLite test command: phoronix-test-suite benchmark sqlite

Hi @ye-yng ,

Could you help to check my steps and configuration? I'm not sure if they are the same as yours, thanks~ And if possible, I would also like to see the qemu commands you used, as I mentioned in my last comment.

Hi @vrozenfe, please also help to check the result: there is only a system disk on both VMs (win2019 with a 45G disk, rhel9.0 with a 20G disk), yet the performance differs greatly.

Thanks a lot for you~ Peixiu

ye-yng commented 2 years ago

Hi @peixiu, thanks for updating with test results!

As I run qemu through LXD I am not very familiar with it. If you could provide some instructions to check the ovmf/edk2 versions that would be great.

Unfortunately the server I used for testing is currently off the network, but I'll report with the kernel and ovmf/edk2 version as soon as it is functioning again.

Meanwhile I have another server running LXD with a similar Windows Server 2019 VM configuration running with the following qemu-kvm command:

/snap/lxd/x1/bin/qemu-system-x86_64 -S 
-name windows-server 
-uuid d3614689-c5c5-4470-922b-baa398250b54
-daemonize 
-cpu host
-nographic
-serial chardev:console
-nodefaults
-no-user-config
-sandbox on,obsolete=deny,elevateprivileges=allow,spawn=deny,resourcecontrol=deny
-readconfig /var/snap/lxd/common/lxd/logs/windows-server/qemu.conf
-spice unix=on,disable-ticketing=on,addr=/var/snap/lxd/common/lxd/logs/windows-server/qemu.spice
-pidfile /var/snap/lxd/common/lxd/logs/windows-server/qemu.pid
-D /var/snap/lxd/common/lxd/logs/windows-server/qemu.log 
-smbios type=2,manufacturer=Canonical Ltd.,product=LXD
-runas lxd
-cpu host,hv_passthrough,
-vmx
peixiu commented 2 years ago

Hi All,

I tried to test this case on a bare metal system with a Win2019 OS installed and hit a similar situation. Following is the SQLite test result on the Win2019 OS installed on the bare metal system: https://openbenchmarking.org/result/2205102-NE-BERMETAL262

Screenshot from 2022-05-10 14-35-55

The bare metal system has 48 logical processors and 64G memory, and I set the system disk size to 50G. The VM has 12 vcpus and 16G memory, and its system disk size is also 50G. The SQLite benchmark performance ratio of VM to bare metal is about 81%. Although they have different CPU and memory, I think that is acceptable.

As for the difference between the Windows VM and the Linux VM, I'm not sure of the exact reason; I just found that SQLite is built on different VFS implementations, e.g. the unix VFS and the windows VFS. Reference doc: https://www.sqlite.org/vfs.html

And BTW, SQLite does not seem to be a block performance tool; it focuses on SQL. For the test content we can refer to this doc: https://www.sqlite.org/testing.html

Thanks~ Peixiu

JonKohler commented 2 years ago

@peixiu - in your win 2019 VM, can you run cpuprint.exe and post the output, specifically of the CPUID 0x40000004, 0x00000000: Hypervisor Enlightenment Implementation Recommendations section. I see the bits you posted from your qemu command line, just want to validate how it actually shows up in the guest.

https://skanthak.homepage.t-online.de/cpuid.html

peixiu commented 2 years ago

> @peixiu - in your win 2019 VM, can you run cpuprint.exe and post the output, specifically of the CPUID 0x40000004, 0x00000000: Hypervisor Enlightenment Implementation Recommendations section. I see the bits you posted from your qemu command line, just want to validate how it actually shows up in the guest.
>
> https://skanthak.homepage.t-online.de/cpuid.html

Hi @JonKohler, I tried to download cpuprint.exe from the link you provided, but the program was reported to contain a virus, so it was blocked: Screenshot from 2022-05-11 11-52-46

And I also did not find another tool to replace this one on windows. I tried to get the cpuid info with a linux vm, booted with the same cpu flags as win2019 vm , -cpu host,hv_stimer,hv_synic,hv_vpindex,hv_reset,hv_relaxed,hv_spinlocks=0x1fff,hv_vapic,hv_time,hv-tlbflush,+kvm_pv_unhalt,hv_passthrough \

I'm not sure if it's ok, details as follows:

CPU 0: vendor_id = "AuthenticAMD" version information (1/eax): processor type = primary processor (0) family = Intel Pentium 4/Pentium D/Pentium Extreme Edition/Celeron/Xeon/Xeon MP/Itanium2, AMD Athlon 64/Athlon XP-M/Opteron/Sempron/Turion (15) model = 0x1 (1) stepping id = 0x2 (2) extended family = 0x8 (8) extended model = 0x0 (0) (simple synth) = unknown miscellaneous (1/ebx): process local APIC physical ID = 0x0 (0) cpu count = 0x2 (2) CLFLUSH line size = 0x8 (8) brand index = 0x0 (0) brand id = 0x00 (0): unknown feature information (1/edx): x87 FPU on chip = true virtual-8086 mode enhancement = true debugging extensions = true page size extensions = true time stamp counter = true RDMSR and WRMSR support = true physical address extensions = true machine check exception = true CMPXCHG8B inst. = true APIC on chip = true SYSENTER and SYSEXIT = true memory type range registers = true PTE global bit = true machine check architecture = true conditional move/compare instruction = true page attribute table = true page size extension = true processor serial number = false CLFLUSH instruction = true debug store = false thermal monitor and clock ctrl = false MMX Technology = true FXSAVE/FXRSTOR = true SSE extensions = true SSE2 extensions = true self snoop = false hyper-threading / multi-core supported = true therm. 
monitor = false IA64 = false pending break event = false feature information (1/ecx): PNI/SSE3: Prescott New Instructions = true PCLMULDQ instruction = true 64-bit debug store = false MONITOR/MWAIT = false CPL-qualified debug store = false VMX: virtual machine extensions = false SMX: safer mode extensions = false Enhanced Intel SpeedStep Technology = false thermal monitor 2 = false SSSE3 extensions = true context ID: adaptive or shared L1 data = false FMA instruction = true CMPXCHG16B instruction = true xTPR disable = false perfmon and debug = false process context identifiers = false direct cache access = false SSE4.1 extensions = true SSE4.2 extensions = true extended xAPIC support = true MOVBE instruction = true POPCNT instruction = true time stamp counter deadline = true AES instruction = true XSAVE/XSTOR states = true OS-enabled XSAVE/XSTOR = true AVX: advanced vector extensions = true F16C half-precision convert instruction = true RDRAND instruction = true hypervisor guest status = true cache and TLB information (2): processor serial number: 0080-0F12-0000-0000-0000-0000 MONITOR/MWAIT (5): smallest monitor-line size (bytes) = 0x0 (0) largest monitor-line size (bytes) = 0x0 (0) enum of Monitor-MWAIT exts supported = true supports intrs as break-event for MWAIT = true number of C0 sub C-states using MWAIT = 0x0 (0) number of C1 sub C-states using MWAIT = 0x0 (0) number of C2 sub C-states using MWAIT = 0x0 (0) number of C3 sub C-states using MWAIT = 0x0 (0) number of C4 sub C-states using MWAIT = 0x0 (0) number of C5 sub C-states using MWAIT = 0x0 (0) number of C6 sub C-states using MWAIT = 0x0 (0) number of C7 sub C-states using MWAIT = 0x0 (0) Thermal and Power Management Features (6): digital thermometer = false Intel Turbo Boost Technology = false ARAT always running APIC timer = true PLN power limit notification = false ECMD extended clock modulation duty = false PTM package thermal management = false HWP base registers = false HWP notification = false HWP 
activity window = false HWP energy performance preference = false HWP package level request = false HDC base registers = false digital thermometer thresholds = 0x0 (0) ACNT/MCNT supported performance measure = false ACNT2 available = false performance-energy bias capability = false extended feature flags (7): FSGSBASE instructions = true IA32_TSC_ADJUST MSR supported = true SGX: Software Guard Extensions supported = false BMI instruction = true HLE hardware lock elision = false AVX2: advanced vector extensions 2 = true FDP_EXCPTN_ONLY = false SMEP supervisor mode exec protection = true BMI2 instructions = true enhanced REP MOVSB/STOSB = false INVPCID instruction = false RTM: restricted transactional memory = false QM: quality of service monitoring = false deprecated FPU CS/DS = false intel memory protection extensions = false PQE: platform quality of service enforce = false AVX512F: AVX-512 foundation instructions = false AVX512DQ: double & quadword instructions = false RDSEED instruction = true ADX instructions = true SMAP: supervisor mode access prevention = true AVX512IFMA: fused multiply add = false CLFLUSHOPT instruction = true CLWB instruction = false Intel processor trace = false AVX512PF: prefetch instructions = false AVX512ER: exponent & reciprocal instrs = false AVX512CD: conflict detection instrs = false SHA instructions = true AVX512BW: byte & word instructions = false AVX512VL: vector length = false PREFETCHWT1 = false AVX512VBMI: vector byte manipulation = false UMIP: user-mode instruction prevention = false PKU protection keys for user-mode = false OSPKE CR4.PKE and RDPKRU/WRPKRU = false BNDLDX/BNDSTX MAWAU value in 64-bit mode = 0x0 (0) RDPID: read processor D supported = false SGX_LC: SGX launch config supported = false AVX512_4VNNIW: neural network instrs = false AVX512_4FMAPS: multiply acc single prec = false Direct Cache Access Parameters (9): PLATFORM_DCA_CAP MSR bits = 0 Architecture Performance Monitoring Features (0xa/eax): version ID = 0x0 
(0) number of counters per logical processor = 0x0 (0) bit width of counter = 0x0 (0) length of EBX bit vector = 0x0 (0) Architecture Performance Monitoring Features (0xa/ebx): core cycle event not available = false instruction retired event not available = false reference cycles event not available = false last-level cache ref event not available = false last-level cache miss event not avail = false branch inst retired event not available = false branch mispred retired event not avail = false Architecture Performance Monitoring Features (0xa/edx): number of fixed counters = 0x0 (0) bit width of fixed counters = 0x0 (0) x2APIC features / processor topology (0xb): --- level 0 (thread) --- bits to shift APIC ID to get next = 0x0 (0) logical processors at this level = 0x1 (1) level number = 0x0 (0) level type = thread (1) extended APIC ID = 0 --- level 1 (core) --- bits to shift APIC ID to get next = 0x1 (1) logical processors at this level = 0x2 (2) level number = 0x1 (1) level type = core (2) extended APIC ID = 0 XSAVE features (0xd/0): XCR0 lower 32 bits valid bit field mask = 0x00000007 XCR0 upper 32 bits valid bit field mask = 0x00000000 XCR0 supported: x87 state = true XCR0 supported: SSE state = true XCR0 supported: AVX state = true XCR0 supported: MPX BNDREGS = false XCR0 supported: MPX BNDCSR = false XCR0 supported: AVX-512 opmask = false XCR0 supported: AVX-512 ZMM_Hi256 = false XCR0 supported: AVX-512 Hi16_ZMM = false IA32_XSS supported: PT state = false XCR0 supported: PKRU state = false bytes required by fields in XCR0 = 0x00000340 (832) bytes required by XSAVE/XRSTOR area = 0x00000340 (832) XSAVE features (0xd/1): XSAVEOPT instruction = true XSAVEC instruction = true XGETBV instruction = true XSAVES/XRSTORS instructions = true SAVE area size in bytes = 0x00000340 (832) IA32_XSS lower 32 bits valid bit field mask = 0x00000000 IA32_XSS upper 32 bits valid bit field mask = 0x00000000 AVX/YMM features (0xd/2): AVX/YMM save state byte size = 0x00000100 (256) 
AVX/YMM save state byte offset = 0x00000240 (576) supported in IA32_XSS or XCR0 = XCR0 (user state) 64-byte alignment in compacted XSAVE = false hypervisor_id = "Linux KVM Hv" 0x40000001 0x00: eax=0x31237648 ebx=0x00000000 ecx=0x00000000 edx=0x00000000 0x40000002 0x00: eax=0x00003839 ebx=0x000a0000 ecx=0x00000000 edx=0x00000000 0x40000003 0x00: eax=0x00002aff ebx=0x00000030 ecx=0x00000000 edx=0x00080508 0x40000004 0x00: eax=0x00000e24 ebx=0x00000fff ecx=0x00000000 edx=0x00000000 0x40000005 0x00: eax=0x00000400 ebx=0x00000040 ecx=0x00000000 edx=0x00000000 extended processor signature (0x80000001/eax): family/generation = AMD Athlon 64/Opteron/Sempron/Turion (15) model = 0x1 (1) stepping id = 0x2 (2) extended family = 0x8 (8) extended model = 0x0 (0) (simple synth) = unknown extended feature flags (0x80000001/edx): x87 FPU on chip = true virtual-8086 mode enhancement = true debugging extensions = true page size extensions = true time stamp counter = true RDMSR and WRMSR support = true physical address extensions = true machine check exception = true CMPXCHG8B inst. = true APIC on chip = true SYSCALL and SYSRET instructions = true memory type range registers = true global paging extension = true machine check architecture = true conditional move/compare instruction = true page attribute table = true page size extension = true multiprocessing capable = false no-execute page protection = true AMD multimedia instruction extensions = true MMX Technology = true FXSAVE/FXRSTOR = true SSE extensions = true 1-GB large page support = true RDTSCP = true long mode (AA-64) = true 3DNow! instruction extensions = false 3DNow! instructions = false extended brand id (0x80000001/ebx): raw = 0x0 (0) BrandId = 0x0 (0) AMD feature flags (0x80000001/ecx): LAHF/SAHF supported in 64-bit mode = true CMP Legacy = true SVM: secure virtual machine = true extended APIC space = false AltMovCr8 = true LZCNT advanced bit manipulation = true SSE4A support = true misaligned SSE mode = true 3DNow! 
PREFETCH/PREFETCHW instructions = true OS visible workaround = true instruction based sampling = false XOP support = false SKINIT/STGI support = false watchdog timer support = false lightweight profiling support = false 4-operand FMA instruction = false NodeId MSR C001100C = false TBM support = false topology extensions = false brand = "AMD EPYC 7301 16-Core Processor " L1 TLB/cache information: 2M/4M pages & L1 TLB (0x80000005/eax): instruction # entries = 0xff (255) instruction associativity = 0x1 (1) data # entries = 0xff (255) data associativity = 0x1 (1) L1 TLB/cache information: 4K pages & L1 TLB (0x80000005/ebx): instruction # entries = 0xff (255) instruction associativity = 0x1 (1) data # entries = 0xff (255) data associativity = 0x1 (1) L1 data cache information (0x80000005/ecx): line size (bytes) = 0x40 (64) lines per tag = 0x1 (1) associativity = 0x2 (2) size (KB) = 0x40 (64) L1 instruction cache information (0x80000005/edx): line size (bytes) = 0x40 (64) lines per tag = 0x1 (1) associativity = 0x2 (2) size (KB) = 0x40 (64) L2 TLB/cache information: 2M/4M pages & L2 TLB (0x80000006/eax): instruction # entries = 0x0 (0) instruction associativity = L2 off (0) data # entries = 0x0 (0) data associativity = L2 off (0) L2 TLB/cache information: 4K pages & L2 TLB (0x80000006/ebx): instruction # entries = 0x200 (512) instruction associativity = 4-way (4) data # entries = 0x200 (512) data associativity = 4-way (4) L2 unified cache information (0x80000006/ecx): line size (bytes) = 0x40 (64) lines per tag = 0x1 (1) associativity = 16-way (8) size (KB) = 0x200 (512) L3 cache information (0x80000006/edx): line size (bytes) = 0x40 (64) lines per tag = 0x1 (1) associativity = 16-way (8) size (in 512KB units) = 0x20 (32) Advanced Power Management Features (0x80000007/edx): temperature sensing diode = false frequency ID (FID) control = false voltage ID (VID) control = false thermal trip (TTP) = false thermal monitor (TM) = false software thermal control (STC) = false 100 
MHz multiplier control = false hardware P-State control = false TscInvariant = false Physical Address and Linear Address Size (0x80000008/eax): maximum physical address bits = 0x30 (48) maximum linear (virtual) address bits = 0x30 (48) maximum guest physical address bits = 0x0 (0) Logical CPU cores (0x80000008/ecx): number of CPU cores - 1 = 0x1 (1) ApicIdCoreIdSize = 0x1 (1) SVM Secure Virtual Machine (0x8000000a/eax): SvmRev: SVM revision = 0x1 (1) SVM Secure Virtual Machine (0x8000000a/edx): nested paging = true LBR virtualization = false SVM lock = false NRIP save = true MSR based TSC rate control = false VMCB clean bits support = false flush by ASID = false decode assists = false SSSE3/SSE5 opcode set disable = false pause intercept filter = false pause filter threshold = false NASID: number of address space identifiers = 0x10 (16): L1 TLB information: 1G pages (0x80000019/eax): instruction # entries = 0x0 (0) instruction associativity = L2 off (0) data # entries = 0x0 (0) data associativity = L2 off (0) L2 TLB information: 1G pages (0x80000019/ebx): instruction # entries = 0x0 (0) instruction associativity = L2 off (0) data # entries = 0x0 (0) data associativity = L2 off (0) SVM Secure Virtual Machine (0x8000001a/eax): 128-bit SSE executed full-width = false MOVU better than MOVL/MOVH* = false Instruction Based Sampling Identifiers (0x8000001b/eax): IBS feature flags valid = false IBS fetch sampling = false IBS execution sampling = false read write of op counter = false op counting mode = false branch target address reporting = false IbsOpCurCnt and IbsOpMaxCnt extend 7 = false invalid RIP indication supported = false Lightweight Profiling Capabilities: Availability (0x8000001c/eax): lightweight profiling = false LWPVAL instruction = false instruction retired event = false branch retired event = false DC miss event = false core clocks not halted event = false core reference clocks not halted event = false interrupt on threshold overflow = false Lightweight 
Profiling Capabilities: Supported (0x8000001c/edx): lightweight profiling = false LWPVAL instruction = false instruction retired event = false branch retired event = false DC miss event = false core clocks not halted event = false core reference clocks not halted event = false interrupt on threshold overflow = false Lightweight Profiling Capabilities (0x8000001c/ebx): LWPCB byte size = 0x0 (0) event record byte size = 0x0 (0) maximum EventId = 0x0 (0) EventInterval1 field offset = 0x0 (0) Lightweight Profiling Capabilities (0x8000001c/ecx): latency counter bit size = 0x0 (0) data cache miss address valid = false amount cache latency is rounded = 0x0 (0) LWP implementation version = 0x0 (0) event ring buffer size in records = 0x0 (0) branch prediction filtering = false IP filtering = false cache level filtering = false cache latency filteing = false Cache Properties (0x8000001d): --- cache 0 --- type = data (1) level = 0x1 (1) self-initializing = true fully associative = false extra cores sharing this cache = 0x0 (0) line size in bytes = 0x3f (63) physical line partitions = 0x0 (0) number of ways = 0x1 (1) number of sets = 511 write-back invalidate = true cache inclusive of lower levels = false --- cache 1 --- type = instruction (2) level = 0x1 (1) self-initializing = true fully associative = false extra cores sharing this cache = 0x0 (0) line size in bytes = 0x3f (63) physical line partitions = 0x0 (0) number of ways = 0x1 (1) number of sets = 511 write-back invalidate = true cache inclusive of lower levels = false --- cache 2 --- type = unified (3) level = 0x2 (2) self-initializing = false fully associative = false extra cores sharing this cache = 0x0 (0) line size in bytes = 0x3f (63) physical line partitions = 0x0 (0) number of ways = 0xf (15) number of sets = 511 write-back invalidate = false cache inclusive of lower levels = false --- cache 3 --- type = unified (3) level = 0x3 (3) self-initializing = true fully associative = false extra cores sharing this 
cache = 0x1 (1) line size in bytes = 0x3f (63) physical line partitions = 0x0 (0) number of ways = 0xf (15) number of sets = 16383 write-back invalidate = false cache inclusive of lower levels = true extended APIC ID = 0 Extended APIC ID (0x8000001e/ebx): compute unit ID = 0x0 (0) cores per compute unit - 1 = 0x0 (0) Extended APIC ID (0x8000001e/ecx): node ID = 0x0 (0) nodes per processor = 1 node (0) 0x8000001f 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000 (instruction supported synth): CMPXCHG8B = true conditional move/compare = true PREFETCH/PREFETCHW = true (multi-processing synth): multi-core (c=2) (multi-processing method): AMD (APIC widths synth): CORE_width=1 SMT_width=0 (APIC synth): PKG_ID=0 CORE_ID=0 SMT_ID=0 (synth) = unknown

Best Regards~ Peixiu