rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
98.54k stars 12.74k forks source link

repr(simd) does not align to Intel recs on x86_64 #81931

Open workingjubilee opened 3 years ago

workingjubilee commented 3 years ago

I tried this code: (Playground). The random inputs are mostly just to keep the compiler fairly "honest" and block optimizing away the instructions it would use.

#![feature(repr_simd)]
#![feature(platform_intrinsics)]
use rand::random;

/// Two-lane `f32` vector declared with `#[repr(simd)]`; this is the type whose
/// reported alignment the issue is about.
#[derive(Debug)]
#[repr(simd)]
struct f32x2(f32, f32);

// Declaration of the compiler-provided two-lane shuffle intrinsic, used together
// with the `platform_intrinsics` feature enabled at the top of this snippet.
extern "platform-intrinsic" {
    // Takes two inputs of type `T` plus a two-element index array and returns a `U`.
    // NOTE(review): presumably each index selects one lane from the concatenation
    // of `a` and `b` — confirm against the intrinsic's documentation.
    fn simd_shuffle2<T, U>(a: T, b: T, idx: [u32; 2]) -> U;
}

/// Reproduction: builds two random `f32x2` values, shuffles them, then prints the
/// alignment `align_of` reports for `f32x2` followed by the shuffled data.
fn main() {
    // Random lanes keep the compiler "honest": the inputs are not compile-time
    // constants, so the instructions it would use are not optimized away.
    let x = f32x2(rand::random(), rand::random());
    let y = f32x2(rand::random(), rand::random());
    // NOTE(review): indices [0, 2] presumably pick lane 0 of `x` and lane 0 of `y` — confirm.
    let z: f32x2 = unsafe { simd_shuffle2(x, y, [0, 2]) };
    // This is the value under discussion: the issue reports 8 here where 16 was expected.
    println!("Alignment is: {:?}", std::mem::align_of::<f32x2>());
    println!("Data is: {:?}", z);
}

Intel's recommendation: "For best performance, the Streaming SIMD Extensions and Streaming SIMD Extensions 2 require their memory operands to be aligned to 16-byte boundaries."

Thus, I expected to see this happen:

Alignment is: 16
Data is: f32x2(0.12946808, 0.4856578)

Instead, this happened:

Alignment is: 8
Data is: f32x2(0.12946808, 0.4856578)

That does not appear to be the correct alignment to report for this type, unless I am misunderstanding something here.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (0fc6756b4 2021-02-08)
binary: rustc
commit-hash: 0fc6756b42e0556cc2e18079f5fc6b4d58f4e81a
commit-date: 2021-02-08
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1

I believe this is related to, but not exactly the same as, #27060. Apologies if this is a total duplicate, or if I am misunderstanding something here about what Rust means by "alignment", but after careful review with @calebzulawski, we started to arrive at the conclusion that something was off.

Here is the generated assembly. As you can see, it uses multiple SSE instructions, including movaps (an aligned load), but I haven't exhaustively analyzed it, so I can't immediately tell whether the actual alignment requirements are being adhered to here and I am just spooked by the seemingly misleading information.

x86_64 Assembly ```asm std::sys_common::backtrace::__rust_begin_short_backtrace: # @std::sys_common::backtrace::__rust_begin_short_backtrace # %bb.0: sub rsp, 8 call rdi mov rax, rsp #APP #NO_APP pop rax ret # -- End function std::rt::lang_start: # @std::rt::lang_start # %bb.0: sub rsp, 8 mov rcx, rdx mov rdx, rsi mov qword ptr [rsp], rdi lea rsi, [rip + .L__unnamed_1] mov rdi, rsp call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL] pop rcx ret # -- End function std::rt::lang_start::{{closure}}: # @"std::rt::lang_start::{{closure}}" # %bb.0: sub rsp, 8 mov rdi, qword ptr [rdi] call std::sys_common::backtrace::__rust_begin_short_backtrace xor eax, eax pop rcx ret # -- End function <&T as core::fmt::Debug>::fmt: # @"<&T as core::fmt::Debug>::fmt" # %bb.0: mov rdi, qword ptr [rdi] jmp qword ptr [rip + core::fmt::float::::fmt@GOTPCREL] # TAILCALL # -- End function core::fmt::num::::fmt: # @"core::fmt::num::::fmt" # %bb.0: push r14 push rbx sub rsp, 8 mov rbx, rsi mov r14, rdi mov rdi, rsi call qword ptr [rip + core::fmt::Formatter::debug_lower_hex@GOTPCREL] test al, al je .LBB4_1 # %bb.3: mov rdi, r14 mov rsi, rbx add rsp, 8 pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::::fmt@GOTPCREL] # TAILCALL .LBB4_1: mov rdi, rbx call qword ptr [rip + core::fmt::Formatter::debug_upper_hex@GOTPCREL] mov rdi, r14 mov rsi, rbx add rsp, 8 test al, al je .LBB4_2 # %bb.4: pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::::fmt@GOTPCREL] # TAILCALL .LBB4_2: pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::imp::::fmt@GOTPCREL] # TAILCALL # -- End function core::ops::function::FnOnce::call_once{{vtable.shim}}: # @"core::ops::function::FnOnce::call_once{{vtable.shim}}" # %bb.0: sub rsp, 8 mov rdi, qword ptr [rdi] call std::sys_common::backtrace::__rust_begin_short_backtrace xor eax, eax pop rcx ret # -- End function core::ptr::drop_in_place<&f32>: # @"core::ptr::drop_in_place<&f32>" # %bb.0: ret # -- End function core::ptr::drop_in_place: # @"core::ptr::drop_in_place" 
# %bb.0: mov rax, qword ptr [rdi] add qword ptr [rax], -1 mov rax, qword ptr [rdi] cmp qword ptr [rax], 0 jne .LBB7_2 # %bb.1: add qword ptr [rax + 8], -1 mov rdi, qword ptr [rdi] cmp qword ptr [rdi + 8], 0 je .LBB7_3 .LBB7_2: ret .LBB7_3: mov esi, 368 mov edx, 16 jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL # -- End function rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate: # @"rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate" # %bb.0: push r15 push r14 push r13 push r12 push rbx sub rsp, 160 mov r15, rdx mov r14, rsi mov rbx, rdi xorps xmm0, xmm0 movaps xmmword ptr [rsp + 16], xmm0 movaps xmmword ptr [rsp], xmm0 mov rsi, rsp mov edx, 32 call qword ptr [rip + ::try_fill_bytes@GOTPCREL] test rax, rax je .LBB8_1 # %bb.2: mov r12, rax mov r13, rdx mov rdi, rax call qword ptr [rdx] # %bb.3: mov rsi, qword ptr [r13 + 8] test rsi, rsi je .LBB8_5 # %bb.4: mov rdx, qword ptr [r13 + 16] mov rdi, r12 call qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB8_5 .LBB8_1: movaps xmm0, xmmword ptr [rsp] movaps xmm1, xmmword ptr [rsp + 16] movaps xmmword ptr [rsp + 144], xmm1 movaps xmmword ptr [rsp + 128], xmm0 lea rdx, [rip + .L__unnamed_2] lea rdi, [rsp + 80] lea rsi, [rsp + 128] mov ecx, 8 call qword ptr [rip + rand_chacha::guts::init_chacha@GOTPCREL] mov rax, qword ptr [rsp + 80] mov rcx, qword ptr [rsp + 120] mov qword ptr [rsp + 64], rcx movups xmm0, xmmword ptr [rsp + 104] movaps xmmword ptr [rsp + 48], xmm0 movups xmm0, xmmword ptr [rsp + 88] movaps xmmword ptr [rsp + 32], xmm0 mov rcx, qword ptr [rbx + 48] mov qword ptr [rbx + 56], rcx mov qword ptr [rbx], rax movaps xmm0, xmmword ptr [rsp + 32] movups xmmword ptr [rbx + 8], xmm0 movaps xmm0, xmmword ptr [rsp + 48] movups xmmword ptr [rbx + 24], xmm0 mov rax, qword ptr [rsp + 64] mov qword ptr [rbx + 40], rax .LBB8_5: mov qword ptr [rbx + 64], r15 mov rax, -256 add rax, qword ptr [rbx + 48] mov qword ptr [rbx + 56], rax mov rdi, rbx mov esi, 6 mov rdx, r14 call qword ptr 
[rip + rand_chacha::guts::refill_wide@GOTPCREL] add rsp, 160 pop rbx pop r12 pop r13 pop r14 pop r15 ret mov rbx, rax mov rdi, r12 mov rsi, r13 call alloc::alloc::box_free mov rdi, rbx call _Unwind_Resume@PLT ud2 # -- End function alloc::alloc::box_free: # @alloc::alloc::box_free # %bb.0: mov rax, rsi mov rsi, qword ptr [rsi + 8] test rsi, rsi je .LBB9_1 # %bb.2: mov rdx, qword ptr [rax + 16] jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL .LBB9_1: ret # -- End function .LCPI10_0: .long 0x33800000 # float 5.96046448E-8 playground::main: # @playground::main # %bb.0: push rbp push r15 push r14 push rbx sub rsp, 72 call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov r14, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_7 # %bb.1: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.2: lea rdx, [rbx + 24] add rbx, 288 mov rcx, qword ptr [r14 + 344] test rcx, rcx jle .LBB10_4 # %bb.3: cmp qword ptr [r14 + 352], rax js .LBB10_4 # %bb.5: add rcx, -256 mov qword ptr [r14 + 344], rcx mov rdi, rbx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_6 .LBB10_4: mov rdi, rbx mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_6: mov qword ptr [r14 + 16], 0 xor eax, eax .LBB10_7: mov r15d, dword ptr [r14 + 4*rax + 24] add rax, 1 mov qword ptr [r14 + 16], rax add qword ptr [r14], -1 jne .LBB10_10 # %bb.8: add qword ptr [r14 + 8], -1 jne .LBB10_10 # %bb.9: mov esi, 368 mov edx, 16 mov rdi, r14 call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_10: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_19 # %bb.11: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.12: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test 
rcx, rcx jle .LBB10_14 # %bb.13: cmp qword ptr [rbx + 352], rax js .LBB10_14 # %bb.17: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_18 .LBB10_14: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_18: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_19: add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_22 # %bb.20: add qword ptr [rbx + 8], -1 jne .LBB10_22 # %bb.21: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_22: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_29 # %bb.23: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.24: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test rcx, rcx jle .LBB10_26 # %bb.25: cmp qword ptr [rbx + 352], rax js .LBB10_26 # %bb.27: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_28 .LBB10_26: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_28: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_29: mov ebp, dword ptr [rbx + 4*rax + 24] add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_32 # %bb.30: add qword ptr [rbx + 8], -1 jne .LBB10_32 # %bb.31: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_32: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_39 # %bb.33: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.34: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test rcx, rcx jle 
.LBB10_36 # %bb.35: cmp qword ptr [rbx + 352], rax js .LBB10_36 # %bb.37: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_38 .LBB10_36: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_38: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_39: add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_42 # %bb.40: add qword ptr [rbx + 8], -1 jne .LBB10_42 # %bb.41: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_42: shr ebp, 8 cvtsi2ss xmm0, ebp shr r15d, 8 cvtsi2ss xmm1, r15d movss xmm2, dword ptr [rip + .LCPI10_0] # xmm2 = mem[0],zero,zero,zero mulss xmm0, xmm2 mulss xmm1, xmm2 unpcklps xmm1, xmm0 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] movlps qword ptr [rsp + 64], xmm1 lea rax, [rip + .L__unnamed_3] mov qword ptr [rsp + 48], rax lea rax, [rip + core::fmt::num::::fmt] mov qword ptr [rsp + 56], rax lea rax, [rip + .L__unnamed_4] mov qword ptr [rsp], rax mov qword ptr [rsp + 8], 2 mov qword ptr [rsp + 16], 0 lea rbx, [rsp + 48] mov qword ptr [rsp + 32], rbx mov qword ptr [rsp + 40], 1 mov rbp, qword ptr [rip + std::io::stdio::_print@GOTPCREL] mov rdi, rsp call rbp lea rax, [rsp + 64] mov qword ptr [rsp + 48], rax lea rax, [rip + ::fmt] mov qword ptr [rsp + 56], rax lea rax, [rip + .L__unnamed_5] mov qword ptr [rsp], rax mov qword ptr [rsp + 8], 2 mov qword ptr [rsp + 16], 0 mov qword ptr [rsp + 32], rbx mov qword ptr [rsp + 40], 1 mov rdi, rsp call rbp add rsp, 72 pop rbx pop r14 pop r15 pop rbp ret jmp .LBB10_16 jmp .LBB10_16 jmp .LBB10_16 .LBB10_16: mov rbx, rax mov rdi, rsp call core::ptr::drop_in_place mov rdi, rbx call _Unwind_Resume@PLT ud2 # -- End function ::fmt: # @"::fmt" # %bb.0: push r15 push r14 push r12 push rbx sub rsp, 40 mov rbx, rdi lea r15, [rdi + 4] lea rdx, [rip + .L__unnamed_6] lea r14, [rsp + 16] mov ecx, 5 mov rdi, r14 call qword ptr [rip + 
core::fmt::Formatter::debug_tuple@GOTPCREL] mov qword ptr [rsp + 8], rbx lea rbx, [rip + .L__unnamed_7] mov r12, qword ptr [rip + core::fmt::builders::DebugTuple::field@GOTPCREL] lea rsi, [rsp + 8] mov rdi, r14 mov rdx, rbx call r12 mov qword ptr [rsp + 8], r15 lea rsi, [rsp + 8] mov rdi, r14 mov rdx, rbx call r12 mov rdi, r14 call qword ptr [rip + core::fmt::builders::DebugTuple::finish@GOTPCREL] add rsp, 40 pop rbx pop r12 pop r14 pop r15 ret # -- End function main: # @main # %bb.0: sub rsp, 8 mov rcx, rsi movsxd rdx, edi lea rax, [rip + playground::main] mov qword ptr [rsp], rax lea rsi, [rip + .L__unnamed_1] mov rdi, rsp call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL] # kill: def $eax killed $eax killed $rax pop rcx ret # -- End function .L__unnamed_1: .quad core::ptr::drop_in_place<&f32> .quad 8 # 0x8 .quad 8 # 0x8 .quad std::rt::lang_start::{{closure}} .quad std::rt::lang_start::{{closure}} .quad core::ops::function::FnOnce::call_once{{vtable.shim}} .L__unnamed_2: .zero 8 .L__unnamed_8: .ascii "Alignment is: " .L__unnamed_9: .byte 10 .L__unnamed_4: .quad .L__unnamed_8 .asciz "\016\000\000\000\000\000\000" .quad .L__unnamed_9 .asciz "\001\000\000\000\000\000\000" .L__unnamed_3: .asciz "\b\000\000\000\000\000\000" .L__unnamed_10: .ascii "Data is: " .L__unnamed_5: .quad .L__unnamed_10 .asciz "\t\000\000\000\000\000\000" .quad .L__unnamed_9 .asciz "\001\000\000\000\000\000\000" .L__unnamed_6: .ascii "f32x2" .L__unnamed_7: .quad core::ptr::drop_in_place<&f32> .quad 8 # 0x8 .quad 8 # 0x8 .quad <&T as core::fmt::Debug>::fmt ```
workingjubilee commented 2 years ago

A note here: after some thought and research, it's not clear that using a higher alignment is actually desired in practice. It may be that the desired alignment for vectors which do not precisely match the machine's vector sizes is lower, around the element size (as if the vector were an array), possibly unless the vector uses a power-of-2 element count (as it does here).