Open Lukasa opened 4 years ago
@swift-ci create
As a useful point of comparison, here's equivalent Rust code (actually somewhat more complex, as it also handles a trailing chunk of fewer than 32 bytes):
```rust
pub fn search_simd(haystack: &[u8], needle: u8) -> Option<usize> {
    let mut offset = 0;
    let needles = packed_simd::u8x32::splat(needle);

    // Compare 32 bytes at a time while a full vector's worth remains.
    while haystack.len() - offset >= 32 {
        let elements = packed_simd::u8x32::from_slice_unaligned(&haystack[offset..]);
        let matches = elements.eq(needles);
        if matches.any() {
            // Collapse the mask to a scalar bitmask and find the first set bit.
            return Some(offset + matches.bitmask().trailing_zeros() as usize);
        }
        offset += 32;
    }

    // Scalar scan over the remaining sub-32-byte tail.
    for (i, &el) in haystack[offset..].iter().enumerate() {
        if el == needle {
            return Some(offset + i);
        }
    }
    None
}
```
And the generated assembly:
```asm
    .text
    .file "f.6oqy8ea2-cgu.0"
    .section .text._ZN1f11search_simd17h87e16be8da9e7b80E,"ax",@progbits
    .globl _ZN1f11search_simd17h87e16be8da9e7b80E
    .p2align 4, 0x90
    .type _ZN1f11search_simd17h87e16be8da9e7b80E,@function
_ZN1f11search_simd17h87e16be8da9e7b80E:
    .cfi_startproc
    pushq %rax
    .cfi_def_cfa_offset 16
    movl %edx, %r8d
    cmpq $32, %rsi
    jb .LBB0_1
    vmovd %r8d, %xmm0
    vpbroadcastb %xmm0, %ymm0
    movl $32, %r9d
    xorl %edx, %edx
    xorl %ecx, %ecx
    .p2align 4, 0x90
.LBB0_9:
    testb $1, %cl
    jne .LBB0_10
    movq %r9, %rax
    vpcmpeqb -32(%rdi,%r9), %ymm0, %ymm1
    vpmovmskb %xmm1, %ecx
    testw %cx, %cx
    jne .LBB0_14
    vextracti128 $1, %ymm1, %xmm2
    vpmovmskb %xmm2, %ecx
    testw %cx, %cx
    jne .LBB0_14
    cmpq %rsi, %rax
    seta %cl
    leaq 32(%rax), %r9
    leaq (%rsi,%rdx), %r10
    addq $-32, %r10
    addq $-32, %rdx
    cmpq $31, %r10
    ja .LBB0_9
    negq %rdx
    cmpq %rsi, %rax
    jbe .LBB0_2
    leaq .L__unnamed_1(%rip), %rax
    movq %rdx, %rdi
    movq %rax, %rdx
    vzeroupper
    callq *_ZN4core5slice22slice_index_order_fail17h05c218501708eed7E@GOTPCREL(%rip)
    ud2
.LBB0_1:
    xorl %edx, %edx
.LBB0_2:
    decq %rdx
    negq %rsi
    xorl %eax, %eax
    .p2align 4, 0x90
.LBB0_3:
    leaq (%rsi,%rdx), %rcx
    cmpq $-1, %rcx
    je .LBB0_4
    cmpb %r8b, 1(%rdi,%rdx)
    leaq 1(%rdx), %rdx
    jne .LBB0_3
    movl $1, %eax
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_14:
    .cfi_def_cfa_offset 16
    vpmovmskb %ymm1, %eax
    xorl %ecx, %ecx
    tzcntl %eax, %ecx
    subq %rdx, %rcx
    movl $1, %eax
    movq %rcx, %rdx
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_4:
    .cfi_def_cfa_offset 16
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_10:
    .cfi_def_cfa_offset 16
    negq %rdx
    leaq .L__unnamed_2(%rip), %rax
    movq %rdx, %rdi
    movq %rax, %rdx
    vzeroupper
    callq *_ZN4core5slice22slice_index_order_fail17h05c218501708eed7E@GOTPCREL(%rip)
    ud2
.Lfunc_end0:
    .size _ZN1f11search_simd17h87e16be8da9e7b80E, .Lfunc_end0-_ZN1f11search_simd17h87e16be8da9e7b80E
    .cfi_endproc
    .type .L__unnamed_3,@object
    .section .rodata..L__unnamed_3,"a",@progbits
.L__unnamed_3:
    .ascii "src/lib.rs"
    .size .L__unnamed_3, 10
    .type .L__unnamed_2,@object
    .section .data.rel.ro..L__unnamed_2,"aw",@progbits
    .p2align 3
.L__unnamed_2:
    .quad .L__unnamed_3
    .asciz "\n\000\000\000\000\000\000\000\005\000\000\000B\000\000"
    .size .L__unnamed_2, 24
    .type .L__unnamed_1,@object
    .section .data.rel.ro..L__unnamed_1,"aw",@progbits
    .p2align 3
.L__unnamed_1:
    .quad .L__unnamed_3
    .asciz "\n\000\000\000\000\000\000\000\f\000\000\000\025\000\000"
    .size .L__unnamed_1, 24
    .section ".note.GNU-stack","",@progbits
```
cc @stephentyrone
Attachment: benchmark package
Additional Detail from JIRA
| | |
|------------------|-----------------|
| Votes | 1 |
| Component/s | Standard Library |
| Labels | Bug, simd |
| Assignee | None |
| Priority | Medium |

md5: c21b41dce5f7de3c531660d78015bffb

Issue Description:
I recently investigated using the SIMD types to implement a generic SIMD-vectorised string search. To get a feel for how well or poorly the SIMD types were doing, I compared them against two other implementations: a pure Swift bytewise string search, and a C implementation using the Intel intrinsics directly. The expectation is that the C implementation is a "best case", and the SIMD one would be judged by how close it could get to that C implementation, starting from the baseline of the bytewise search. This did not attempt to solve the problem in full generality (handling alignment, etc.), as I just wanted to investigate the order of magnitude of the possible wins.
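For concreteness, here's a minimal sketch of the shape of that bytewise baseline (illustrative only, assuming an `UnsafeRawBufferPointer` input; it is not the attached benchmark's exact code):

```swift
// Bytewise baseline: scan one byte at a time.
// Illustrative sketch; the attached benchmark package has the real code.
func searchBytewise(_ haystack: UnsafeRawBufferPointer, needle: UInt8) -> Int? {
    for index in haystack.indices where haystack[index] == needle {
        return index
    }
    return nil
}
```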
My benchmark package is attached.
Here are my results, searching a 1kB buffer for a byte that appears only in the final byte position:
I was surprised by how poorly the SIMD implementation performed, so I extracted the common implementations into Godbolt. The idealised C implementation is here. The bytewise Swift implementation is here. The Swift SIMD implementation is here.
Part of the performance cost here is that there is no apparent way to transform a SIMDMask into a bitmask. That leaves checking every SIMD lane manually as the only way to query it, which is quite expensive. I tried to work around this by comparing against the zero mask, but that comparison also appears to be done lanewise, which is another strange performance issue.
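To make that concrete, here's roughly the shape the Swift SIMD version is forced into (a sketch assuming `SIMD32<UInt8>`, the pointwise `.==` operator, and the stdlib's `any(_:)`; the names are illustrative rather than the attached benchmark's exact code):

```swift
func searchSIMD(_ haystack: UnsafeRawBufferPointer, needle: UInt8) -> Int? {
    let needles = SIMD32<UInt8>(repeating: needle)
    var offset = 0
    while haystack.count - offset >= 32 {
        // Build the vector from a byte slice (see below for the code-size
        // cost of this initialiser).
        let chunk = SIMD32<UInt8>(haystack[offset ..< offset + 32])
        let matches = chunk .== needles
        if any(matches) {
            // No bitmask()/trailing_zeros() equivalent: the first matching
            // lane can only be found by inspecting each lane individually.
            for lane in 0..<32 where matches[lane] {
                return offset + lane
            }
        }
        offset += 32
    }
    // Scalar scan over the sub-32-byte tail.
    for index in offset..<haystack.count where haystack[index] == needle {
        return index
    }
    return nil
}
```

Even where `any(_:)` avoids a lanewise existence check, locating the first matching lane still needs the per-lane loop above, exactly the operation the `vpmovmskb`/`tzcnt` pair performs in one step in the Rust output.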
Finally, I was a bit startled at exactly how much code is needed to initialise the SIMD32s from the UnsafeRawBufferPointer. While I didn't expect C's level of brevity, the amount of code involved here is so high that if I add even a few extra lines of code to the Swift function, the SIMD32 initialiser gets outlined for code-size reasons.
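For what it's worth, later Swift versions offer a terser route for the load itself (assuming SE-0349's `loadUnaligned(fromByteOffset:as:)`, which is Swift 5.7+ and so postdates this report):

```swift
// SIMD32<UInt8> is a trivial type, so an unaligned raw load is allowed.
let chunk = haystack.loadUnaligned(fromByteOffset: offset, as: SIMD32<UInt8>.self)
```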
The result of all of this is that SIMD appears to be unusable for string processing tasks at this time, unless I've missed something substantial about how it's supposed to be used.