BurntSushi / aho-corasick

A fast implementation of Aho-Corasick in Rust.

[Note] Investigating "transmute slower than intrinsics for vector unpacking" #103

Closed: mystise closed this issue 1 year ago

mystise commented 1 year ago

https://github.com/BurntSushi/aho-corasick/blob/4e7fa3b85dd3a3ce882896f1d4ee22b1f271f0b4/src/packed/vector.rs#L91-L106

Minimal reproduction:

use std::arch::x86_64::*;

// Intrinsics version: extract the two 128-bit lanes, then pull out each
// 64-bit element individually.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] {
    let lo = _mm256_extracti128_si256(a, 0);
    let hi = _mm256_extracti128_si256(a, 1);
    [
        _mm_cvtsi128_si64(lo) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64,
        _mm_cvtsi128_si64(hi) as u64,
        _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64,
    ]
}

// Transmute version: the same reinterpretation expressed as a plain bit cast.
#[target_feature(enable = "avx2")]
pub unsafe fn unpack64x256_transmute(a: __m256i) -> [u64; 4] {
    std::mem::transmute(a)
}

// Identical to the above, but without the avx2 target feature enabled.
pub unsafe fn unpack64x256_transmute_no_feature(a: __m256i) -> [u64; 4] {
    std::mem::transmute(a)
}

Resulting assembly (godbolt):

example::unpack64x256:
        mov     rax, rdi
        vmovaps ymm0, ymmword ptr [rsi]
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

example::unpack64x256_transmute:
        mov     rax, rdi
        vmovaps ymm0, ymmword ptr [rsi]
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

example::unpack64x256_transmute_no_feature:
        mov     rax, rdi
        movaps  xmm0, xmmword ptr [rsi]
        movaps  xmm1, xmmword ptr [rsi + 16]
        movups  xmmword ptr [rdi + 16], xmm1
        movups  xmmword ptr [rdi], xmm0
        ret

Link: https://rust.godbolt.org/z/jnr81hGnq

It would appear that, as of at least Rust 1.60.0 (the current MSRV), the transmute optimizes to exactly the same asm as the intrinsics, but only when the avx2 feature is enabled on the function. (With optimizations disabled, the transmute versions are only a few lines longer, while the intrinsics version is over 70 lines of asm with multiple function calls, because the intrinsics are not inlined.)
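
For completeness, here is a minimal sketch of how a target_feature function like this would typically be called safely at runtime, assuming the unpack64x256_transmute function from the reproduction above is in scope. The wrapper name and the unaligned load are just illustrative choices, not anything from the crate:

use std::arch::x86_64::*;

// Illustrative safe wrapper: dispatch to the AVX2-gated function only
// after confirming at runtime that the CPU supports AVX2.
pub fn unpack_checked(bytes: &[u8; 32]) -> Option<[u64; 4]> {
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified, and
        // _mm256_loadu_si256 has no alignment requirement.
        unsafe {
            let v = _mm256_loadu_si256(bytes.as_ptr() as *const __m256i);
            Some(unpack64x256_transmute(v))
        }
    } else {
        None
    }
}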

Switching Godbolt back to Rust 1.27.1 (the earliest stable release with SIMD intrinsics), the asm did used to differ between the two feature-enabled versions:

example::unpack64x256:
        vmovaps ymm0, ymmword ptr [rsi]
        vextractf128    xmm1, ymm0, 1
        vpermilps       xmm2, xmm0, 78
        vpermilps       xmm1, xmm1, 78
        vunpcklpd       ymm0, ymm0, ymm2
        vbroadcastsd    ymm1, xmm1
        vblendps        ymm0, ymm0, ymm1, 192
        vmovups ymmword ptr [rdi], ymm0
        mov     rax, rdi
        vzeroupper
        ret

example::unpack64x256_transmute:
        vmovaps ymm0, ymmword ptr [rsi]
        vmovups ymmword ptr [rdi], ymm0
        mov     rax, rdi
        vzeroupper
        ret

example::unpack64x256_transmute_no_feature:
        movaps  xmm0, xmmword ptr [rsi]
        movaps  xmm1, xmmword ptr [rsi + 16]
        movups  xmmword ptr [rdi + 16], xmm1
        movups  xmmword ptr [rdi], xmm0
        mov     rax, rdi
        ret

Interestingly, the modern codegen is a very minor tweak of the version that the source comment described as being slower: both functions now compile to the old transmute output, just with the mov rax, rdi hoisted to the top.

Not a bug or a necessary change, just something I found interesting.

BurntSushi commented 1 year ago

Thanks for doing this analysis! Note, though, that I think it's important not to look at unpack64x256 in isolation, but at where it is actually inlined and used.

In any case, things definitely could have changed since I wrote that comment. I probably have a bias toward sticking to the intrinsics because I feel like that gives us a more solid guarantee, but if a transmute is empirically and measurably faster, then I'd be happy to consider that.
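
For anyone who wants to measure this empirically, a rough micro-benchmark sketch is below, again assuming the functions from the reproduction above are in scope. Note two caveats: it times the unpack in a tight loop, which is exactly the in-isolation measurement cautioned against above, and std::hint::black_box stabilized in a newer compiler than the crate's 1.60.0 MSRV. The iteration count and the XOR checksum are arbitrary choices to keep the work from being optimized away:

use std::arch::x86_64::*;
use std::hint::black_box;
use std::time::Instant;

#[target_feature(enable = "avx2")]
unsafe fn bench_unpack(iters: u64) -> (std::time::Duration, u64) {
    let v = _mm256_set1_epi64x(0x0123_4567_89ab_cdef);
    let mut sum = 0u64;
    let start = Instant::now();
    for _ in 0..iters {
        // Swap in unpack64x256 here to time the intrinsics version instead.
        let lanes = unpack64x256_transmute(black_box(v));
        sum = sum.wrapping_add(lanes[0] ^ lanes[1] ^ lanes[2] ^ lanes[3]);
    }
    (start.elapsed(), sum)
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        let (elapsed, sum) = unsafe { bench_unpack(100_000_000) };
        println!("{elapsed:?} (checksum: {sum})");
    }
}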