repr(simd) does not align to Intel recs on x86_64

I tried this code: (Playground). The random inputs are mostly there to keep the compiler fairly "honest" and to stop it from optimizing away the instructions it would otherwise use.

#![feature(repr_simd)]
#![feature(platform_intrinsics)]
use rand::random;

/// Two-lane `f32` vector type used for this alignment repro.
///
/// Marked `#[repr(simd)]` (enabled by the `repr_simd` feature above); the
/// alignment the compiler reports for this type is the subject of the report.
/// `Debug` is derived so the shuffled result can be printed with `{:?}`.
#[derive(Debug)]
#[repr(simd)]
struct f32x2(f32, f32);

// Shuffle intrinsic declared through the `platform_intrinsics` feature.
// It takes two vectors of the same type `T` plus an array of two `u32`
// indices and returns a value of type `U`.
// NOTE(review): the indices presumably select lanes from the concatenation
// of `a` and `b` — confirm against the intrinsic's definition.
extern "platform-intrinsic" {
    fn simd_shuffle2<T, U>(a: T, b: T, idx: [u32; 2]) -> U;
}

/// Repro entry point: shuffles two randomly filled `f32x2` vectors, then
/// prints the alignment reported for `f32x2` and the shuffled result.
fn main() {
    // Random lanes keep the compiler from optimizing the SIMD work away.
    let x = f32x2(rand::random(), rand::random());
    let y = f32x2(rand::random(), rand::random());
    // NOTE(review): `[0, 2]` presumably picks lane 0 of `x` and lane 0 of `y`
    // (indices counted across both inputs) — confirm against the intrinsic.
    let z: f32x2 = unsafe { simd_shuffle2(x, y, [0, 2]) };
    // The value under test: this report observed 8 where 16 was expected.
    println!("Alignment is: {:?}", std::mem::align_of::<f32x2>());
    // Printing `z` keeps the shuffle result live so it is not discarded.
    println!("Data is: {:?}", z);
}

For best performance, the Streaming SIMD Extensions and Streaming SIMD Extensions 2 require their memory operands to be aligned to 16-byte boundaries.

Thus, I expected to see this happen:

Alignment is: 16
Data is: f32x2(0.12946808, 0.4856578)

Instead, this happened:

Alignment is: 8
Data is: f32x2(0.12946808, 0.4856578)

That does not appear to be the correct alignment to report for this type, unless I am misunderstanding something here.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (0fc6756b4 2021-02-08)
binary: rustc
commit-hash: 0fc6756b42e0556cc2e18079f5fc6b4d58f4e81a
commit-date: 2021-02-08
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1

I believe this is related to, but not exactly the same as, #27060. Apologies if this is a total duplicate, or if I am misunderstanding something here about what Rust means by "alignment", but after careful review with @calebzulawski, we started to arrive at the conclusion that something was off.

Here is the generated assembly. As you can see, it uses multiple SSE instructions, including movaps, an aligned load. However, I haven't exhaustively analyzed it, so I can't immediately tell whether the actual alignment requirements are being adhered to here and I am just spooked by the seemingly misleading information.

x86_64 Assembly

```asm std::sys_common::backtrace::__rust_begin_short_backtrace: # @std::sys_common::backtrace::__rust_begin_short_backtrace # %bb.0: sub rsp, 8 call rdi mov rax, rsp #APP #NO_APP pop rax ret # -- End function std::rt::lang_start: # @std::rt::lang_start # %bb.0: sub rsp, 8 mov rcx, rdx mov rdx, rsi mov qword ptr [rsp], rdi lea rsi, [rip + .L__unnamed_1] mov rdi, rsp call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL] pop rcx ret # -- End function std::rt::lang_start::{{closure}}: # @"std::rt::lang_start::{{closure}}" # %bb.0: sub rsp, 8 mov rdi, qword ptr [rdi] call std::sys_common::backtrace::__rust_begin_short_backtrace xor eax, eax pop rcx ret # -- End function <&T as core::fmt::Debug>::fmt: # @"<&T as core::fmt::Debug>::fmt" # %bb.0: mov rdi, qword ptr [rdi] jmp qword ptr [rip + core::fmt::float::::fmt@GOTPCREL] # TAILCALL # -- End function core::fmt::num::::fmt: # @"core::fmt::num::::fmt" # %bb.0: push r14 push rbx sub rsp, 8 mov rbx, rsi mov r14, rdi mov rdi, rsi call qword ptr [rip + core::fmt::Formatter::debug_lower_hex@GOTPCREL] test al, al je .LBB4_1 # %bb.3: mov rdi, r14 mov rsi, rbx add rsp, 8 pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::::fmt@GOTPCREL] # TAILCALL .LBB4_1: mov rdi, rbx call qword ptr [rip + core::fmt::Formatter::debug_upper_hex@GOTPCREL] mov rdi, r14 mov rsi, rbx add rsp, 8 test al, al je .LBB4_2 # %bb.4: pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::::fmt@GOTPCREL] # TAILCALL .LBB4_2: pop rbx pop r14 jmp qword ptr [rip + core::fmt::num::imp::::fmt@GOTPCREL] # TAILCALL # -- End function core::ops::function::FnOnce::call_once{{vtable.shim}}: # @"core::ops::function::FnOnce::call_once{{vtable.shim}}" # %bb.0: sub rsp, 8 mov rdi, qword ptr [rdi] call std::sys_common::backtrace::__rust_begin_short_backtrace xor eax, eax pop rcx ret # -- End function core::ptr::drop_in_place<&f32>: # @"core::ptr::drop_in_place<&f32>" # %bb.0: ret # -- End function core::ptr::drop_in_place: # @"core::ptr::drop_in_place" # %bb.0: mov 
rax, qword ptr [rdi] add qword ptr [rax], -1 mov rax, qword ptr [rdi] cmp qword ptr [rax], 0 jne .LBB7_2 # %bb.1: add qword ptr [rax + 8], -1 mov rdi, qword ptr [rdi] cmp qword ptr [rdi + 8], 0 je .LBB7_3 .LBB7_2: ret .LBB7_3: mov esi, 368 mov edx, 16 jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL # -- End function rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate: # @"rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate" # %bb.0: push r15 push r14 push r13 push r12 push rbx sub rsp, 160 mov r15, rdx mov r14, rsi mov rbx, rdi xorps xmm0, xmm0 movaps xmmword ptr [rsp + 16], xmm0 movaps xmmword ptr [rsp], xmm0 mov rsi, rsp mov edx, 32 call qword ptr [rip + ::try_fill_bytes@GOTPCREL] test rax, rax je .LBB8_1 # %bb.2: mov r12, rax mov r13, rdx mov rdi, rax call qword ptr [rdx] # %bb.3: mov rsi, qword ptr [r13 + 8] test rsi, rsi je .LBB8_5 # %bb.4: mov rdx, qword ptr [r13 + 16] mov rdi, r12 call qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB8_5 .LBB8_1: movaps xmm0, xmmword ptr [rsp] movaps xmm1, xmmword ptr [rsp + 16] movaps xmmword ptr [rsp + 144], xmm1 movaps xmmword ptr [rsp + 128], xmm0 lea rdx, [rip + .L__unnamed_2] lea rdi, [rsp + 80] lea rsi, [rsp + 128] mov ecx, 8 call qword ptr [rip + rand_chacha::guts::init_chacha@GOTPCREL] mov rax, qword ptr [rsp + 80] mov rcx, qword ptr [rsp + 120] mov qword ptr [rsp + 64], rcx movups xmm0, xmmword ptr [rsp + 104] movaps xmmword ptr [rsp + 48], xmm0 movups xmm0, xmmword ptr [rsp + 88] movaps xmmword ptr [rsp + 32], xmm0 mov rcx, qword ptr [rbx + 48] mov qword ptr [rbx + 56], rcx mov qword ptr [rbx], rax movaps xmm0, xmmword ptr [rsp + 32] movups xmmword ptr [rbx + 8], xmm0 movaps xmm0, xmmword ptr [rsp + 48] movups xmmword ptr [rbx + 24], xmm0 mov rax, qword ptr [rsp + 64] mov qword ptr [rbx + 40], rax .LBB8_5: mov qword ptr [rbx + 64], r15 mov rax, -256 add rax, qword ptr [rbx + 48] mov qword ptr [rbx + 56], rax mov rdi, rbx mov esi, 6 mov rdx, r14 call qword ptr [rip + 
rand_chacha::guts::refill_wide@GOTPCREL] add rsp, 160 pop rbx pop r12 pop r13 pop r14 pop r15 ret mov rbx, rax mov rdi, r12 mov rsi, r13 call alloc::alloc::box_free mov rdi, rbx call _Unwind_Resume@PLT ud2 # -- End function alloc::alloc::box_free: # @alloc::alloc::box_free # %bb.0: mov rax, rsi mov rsi, qword ptr [rsi + 8] test rsi, rsi je .LBB9_1 # %bb.2: mov rdx, qword ptr [rax + 16] jmp qword ptr [rip + __rust_dealloc@GOTPCREL] # TAILCALL .LBB9_1: ret # -- End function .LCPI10_0: .long 0x33800000 # float 5.96046448E-8 playground::main: # @playground::main # %bb.0: push rbp push r15 push r14 push rbx sub rsp, 72 call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov r14, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_7 # %bb.1: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.2: lea rdx, [rbx + 24] add rbx, 288 mov rcx, qword ptr [r14 + 344] test rcx, rcx jle .LBB10_4 # %bb.3: cmp qword ptr [r14 + 352], rax js .LBB10_4 # %bb.5: add rcx, -256 mov qword ptr [r14 + 344], rcx mov rdi, rbx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_6 .LBB10_4: mov rdi, rbx mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_6: mov qword ptr [r14 + 16], 0 xor eax, eax .LBB10_7: mov r15d, dword ptr [r14 + 4*rax + 24] add rax, 1 mov qword ptr [r14 + 16], rax add qword ptr [r14], -1 jne .LBB10_10 # %bb.8: add qword ptr [r14 + 8], -1 jne .LBB10_10 # %bb.9: mov esi, 368 mov edx, 16 mov rdi, r14 call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_10: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_19 # %bb.11: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.12: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test rcx, 
rcx jle .LBB10_14 # %bb.13: cmp qword ptr [rbx + 352], rax js .LBB10_14 # %bb.17: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_18 .LBB10_14: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_18: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_19: add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_22 # %bb.20: add qword ptr [rbx + 8], -1 jne .LBB10_22 # %bb.21: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_22: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_29 # %bb.23: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.24: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test rcx, rcx jle .LBB10_26 # %bb.25: cmp qword ptr [rbx + 352], rax js .LBB10_26 # %bb.27: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_28 .LBB10_26: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_28: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_29: mov ebp, dword ptr [rbx + 4*rax + 24] add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_32 # %bb.30: add qword ptr [rbx + 8], -1 jne .LBB10_32 # %bb.31: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_32: call qword ptr [rip + rand::rngs::thread::thread_rng@GOTPCREL] mov rbx, rax mov qword ptr [rsp], rax mov rax, qword ptr [rax + 16] cmp rax, 64 jb .LBB10_39 # %bb.33: call qword ptr [rip + rand::rngs::adapter::reseeding::fork::get_fork_counter@GOTPCREL] # %bb.34: lea rdx, [rbx + 24] mov rdi, rbx add rdi, 288 mov rcx, qword ptr [rbx + 344] test rcx, rcx jle 
.LBB10_36 # %bb.35: cmp qword ptr [rbx + 352], rax js .LBB10_36 # %bb.37: add rcx, -256 mov qword ptr [rbx + 344], rcx mov esi, 6 call qword ptr [rip + rand_chacha::guts::refill_wide@GOTPCREL] jmp .LBB10_38 .LBB10_36: mov rsi, rdx mov rdx, rax call rand::rngs::adapter::reseeding::ReseedingCore::reseed_and_generate .LBB10_38: mov qword ptr [rbx + 16], 0 xor eax, eax .LBB10_39: add rax, 1 mov qword ptr [rbx + 16], rax add qword ptr [rbx], -1 jne .LBB10_42 # %bb.40: add qword ptr [rbx + 8], -1 jne .LBB10_42 # %bb.41: mov esi, 368 mov edx, 16 mov rdi, rbx call qword ptr [rip + __rust_dealloc@GOTPCREL] .LBB10_42: shr ebp, 8 cvtsi2ss xmm0, ebp shr r15d, 8 cvtsi2ss xmm1, r15d movss xmm2, dword ptr [rip + .LCPI10_0] # xmm2 = mem[0],zero,zero,zero mulss xmm0, xmm2 mulss xmm1, xmm2 unpcklps xmm1, xmm0 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] movlps qword ptr [rsp + 64], xmm1 lea rax, [rip + .L__unnamed_3] mov qword ptr [rsp + 48], rax lea rax, [rip + core::fmt::num::::fmt] mov qword ptr [rsp + 56], rax lea rax, [rip + .L__unnamed_4] mov qword ptr [rsp], rax mov qword ptr [rsp + 8], 2 mov qword ptr [rsp + 16], 0 lea rbx, [rsp + 48] mov qword ptr [rsp + 32], rbx mov qword ptr [rsp + 40], 1 mov rbp, qword ptr [rip + std::io::stdio::_print@GOTPCREL] mov rdi, rsp call rbp lea rax, [rsp + 64] mov qword ptr [rsp + 48], rax lea rax, [rip + ::fmt] mov qword ptr [rsp + 56], rax lea rax, [rip + .L__unnamed_5] mov qword ptr [rsp], rax mov qword ptr [rsp + 8], 2 mov qword ptr [rsp + 16], 0 mov qword ptr [rsp + 32], rbx mov qword ptr [rsp + 40], 1 mov rdi, rsp call rbp add rsp, 72 pop rbx pop r14 pop r15 pop rbp ret jmp .LBB10_16 jmp .LBB10_16 jmp .LBB10_16 .LBB10_16: mov rbx, rax mov rdi, rsp call core::ptr::drop_in_place mov rdi, rbx call _Unwind_Resume@PLT ud2 # -- End function ::fmt: # @"::fmt" # %bb.0: push r15 push r14 push r12 push rbx sub rsp, 40 mov rbx, rdi lea r15, [rdi + 4] lea rdx, [rip + .L__unnamed_6] lea r14, [rsp + 16] mov ecx, 5 mov rdi, r14 call qword ptr [rip + 
core::fmt::Formatter::debug_tuple@GOTPCREL] mov qword ptr [rsp + 8], rbx lea rbx, [rip + .L__unnamed_7] mov r12, qword ptr [rip + core::fmt::builders::DebugTuple::field@GOTPCREL] lea rsi, [rsp + 8] mov rdi, r14 mov rdx, rbx call r12 mov qword ptr [rsp + 8], r15 lea rsi, [rsp + 8] mov rdi, r14 mov rdx, rbx call r12 mov rdi, r14 call qword ptr [rip + core::fmt::builders::DebugTuple::finish@GOTPCREL] add rsp, 40 pop rbx pop r12 pop r14 pop r15 ret # -- End function main: # @main # %bb.0: sub rsp, 8 mov rcx, rsi movsxd rdx, edi lea rax, [rip + playground::main] mov qword ptr [rsp], rax lea rsi, [rip + .L__unnamed_1] mov rdi, rsp call qword ptr [rip + std::rt::lang_start_internal@GOTPCREL] # kill: def $eax killed $eax killed $rax pop rcx ret # -- End function .L__unnamed_1: .quad core::ptr::drop_in_place<&f32> .quad 8 # 0x8 .quad 8 # 0x8 .quad std::rt::lang_start::{{closure}} .quad std::rt::lang_start::{{closure}} .quad core::ops::function::FnOnce::call_once{{vtable.shim}} .L__unnamed_2: .zero 8 .L__unnamed_8: .ascii "Alignment is: " .L__unnamed_9: .byte 10 .L__unnamed_4: .quad .L__unnamed_8 .asciz "\016\000\000\000\000\000\000" .quad .L__unnamed_9 .asciz "\001\000\000\000\000\000\000" .L__unnamed_3: .asciz "\b\000\000\000\000\000\000" .L__unnamed_10: .ascii "Data is: " .L__unnamed_5: .quad .L__unnamed_10 .asciz "\t\000\000\000\000\000\000" .quad .L__unnamed_9 .asciz "\001\000\000\000\000\000\000" .L__unnamed_6: .ascii "f32x2" .L__unnamed_7: .quad core::ptr::drop_in_place<&f32> .quad 8 # 0x8 .quad 8 # 0x8 .quad <&T as core::fmt::Debug>::fmt ```

rust-lang / rust

repr(simd) does not align to Intel recs on x86_64 #81931

Meta