Open Lukasa opened 4 years ago
@swift-ci create
As a useful point of comparison, here's equivalent Rust code (actually somewhat more complex, as it also handles a trailing chunk of fewer than 32 bytes):
```rust
pub fn search_simd(haystack: &[u8], needle: u8) -> Option<usize> {
    let mut offset = 0;
    let needles = packed_simd::u8x32::splat(needle);

    // Compare 32 bytes at a time while a full vector's worth remains.
    while haystack.len() - offset >= 32 {
        let elements = packed_simd::u8x32::from_slice_unaligned(&haystack[offset..]);
        let matches = elements.eq(needles);
        if matches.any() {
            // Collapse the mask to a scalar bitmask and find the first set bit.
            return Some(offset + matches.bitmask().trailing_zeros() as usize);
        }
        offset += 32;
    }

    // Scalar scan over the remaining sub-32-byte tail.
    for (i, &el) in haystack[offset..].iter().enumerate() {
        if el == needle {
            return Some(offset + i);
        }
    }
    None
}
```
And the generated assembly:
```asm
    .text
    .file "f.6oqy8ea2-cgu.0"
    .section .text._ZN1f11search_simd17h87e16be8da9e7b80E,"ax",@progbits
    .globl _ZN1f11search_simd17h87e16be8da9e7b80E
    .p2align 4, 0x90
    .type _ZN1f11search_simd17h87e16be8da9e7b80E,@function
_ZN1f11search_simd17h87e16be8da9e7b80E:
    .cfi_startproc
    pushq %rax
    .cfi_def_cfa_offset 16
    movl %edx, %r8d
    cmpq $32, %rsi
    jb .LBB0_1
    vmovd %r8d, %xmm0
    vpbroadcastb %xmm0, %ymm0
    movl $32, %r9d
    xorl %edx, %edx
    xorl %ecx, %ecx
    .p2align 4, 0x90
.LBB0_9:
    testb $1, %cl
    jne .LBB0_10
    movq %r9, %rax
    vpcmpeqb -32(%rdi,%r9), %ymm0, %ymm1
    vpmovmskb %xmm1, %ecx
    testw %cx, %cx
    jne .LBB0_14
    vextracti128 $1, %ymm1, %xmm2
    vpmovmskb %xmm2, %ecx
    testw %cx, %cx
    jne .LBB0_14
    cmpq %rsi, %rax
    seta %cl
    leaq 32(%rax), %r9
    leaq (%rsi,%rdx), %r10
    addq $-32, %r10
    addq $-32, %rdx
    cmpq $31, %r10
    ja .LBB0_9
    negq %rdx
    cmpq %rsi, %rax
    jbe .LBB0_2
    leaq .L__unnamed_1(%rip), %rax
    movq %rdx, %rdi
    movq %rax, %rdx
    vzeroupper
    callq *_ZN4core5slice22slice_index_order_fail17h05c218501708eed7E@GOTPCREL(%rip)
    ud2
.LBB0_1:
    xorl %edx, %edx
.LBB0_2:
    decq %rdx
    negq %rsi
    xorl %eax, %eax
    .p2align 4, 0x90
.LBB0_3:
    leaq (%rsi,%rdx), %rcx
    cmpq $-1, %rcx
    je .LBB0_4
    cmpb %r8b, 1(%rdi,%rdx)
    leaq 1(%rdx), %rdx
    jne .LBB0_3
    movl $1, %eax
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_14:
    .cfi_def_cfa_offset 16
    vpmovmskb %ymm1, %eax
    xorl %ecx, %ecx
    tzcntl %eax, %ecx
    subq %rdx, %rcx
    movl $1, %eax
    movq %rcx, %rdx
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_4:
    .cfi_def_cfa_offset 16
    popq %rcx
    .cfi_def_cfa_offset 8
    vzeroupper
    retq
.LBB0_10:
    .cfi_def_cfa_offset 16
    negq %rdx
    leaq .L__unnamed_2(%rip), %rax
    movq %rdx, %rdi
    movq %rax, %rdx
    vzeroupper
    callq *_ZN4core5slice22slice_index_order_fail17h05c218501708eed7E@GOTPCREL(%rip)
    ud2
.Lfunc_end0:
    .size _ZN1f11search_simd17h87e16be8da9e7b80E, .Lfunc_end0-_ZN1f11search_simd17h87e16be8da9e7b80E
    .cfi_endproc
    .type .L__unnamed_3,@object
    .section .rodata..L__unnamed_3,"a",@progbits
.L__unnamed_3:
    .ascii "src/lib.rs"
    .size .L__unnamed_3, 10
    .type .L__unnamed_2,@object
    .section .data.rel.ro..L__unnamed_2,"aw",@progbits
    .p2align 3
.L__unnamed_2:
    .quad .L__unnamed_3
    .asciz "\n\000\000\000\000\000\000\000\005\000\000\000B\000\000"
    .size .L__unnamed_2, 24
    .type .L__unnamed_1,@object
    .section .data.rel.ro..L__unnamed_1,"aw",@progbits
    .p2align 3
.L__unnamed_1:
    .quad .L__unnamed_3
    .asciz "\n\000\000\000\000\000\000\000\f\000\000\000\025\000\000"
    .size .L__unnamed_1, 24
    .section ".note.GNU-stack","",@progbits
```
cc @stephentyrone
Attachment: benchmark package
Additional Detail from JIRA
| | |
|------------------|-----------------|
| Votes | 1 |
| Component/s | Standard Library |
| Labels | Bug, simd |
| Assignee | None |
| Priority | Medium |

md5: c21b41dce5f7de3c531660d78015bffb

Issue Description:
I recently investigated using the SIMD types to implement a generic SIMD-vectorised string search. To get a feel for how well or poorly the SIMD types were doing, I compared them against two other implementations: a pure Swift bytewise string search, and a C implementation using the Intel intrinsics directly. The expectation is that the C implementation is a "best case", and the SIMD one would be judged by how close it could get to that C implementation, starting from the baseline of the bytewise search. This did not attempt to solve the problem in full generality (handling alignment, etc.), as I just wanted to investigate the order of magnitude of the possible wins.
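For concreteness, here's a minimal sketch of the shape of that bytewise baseline (illustrative only, assuming an `UnsafeRawBufferPointer` input; it is not the attached benchmark's exact code):

```swift
// Bytewise baseline: scan one byte at a time.
// Illustrative sketch; the attached benchmark package has the real code.
func searchBytewise(_ haystack: UnsafeRawBufferPointer, needle: UInt8) -> Int? {
    for index in haystack.indices where haystack[index] == needle {
        return index
    }
    return nil
}
```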
My benchmark package is attached.
Here are my results, searching a 1kB buffer for a byte that appears only in the final byte position:
I was surprised by how poorly the SIMD implementation performed, so I extracted the common implementations into Godbolt. The idealised C implementation is here. The bytewise Swift implementation is here. The Swift SIMD implementation is here.
Part of the performance cost here is that there is no apparent way to transform a SIMDMask into a bitmask. That leaves checking every SIMD lane manually as the only way to query it, which is quite expensive. I tried to work around this by comparing against the zero mask, but that comparison also appears to be done lanewise, which is another strange performance issue.
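To make that concrete, here's roughly the shape the Swift SIMD version is forced into (a sketch assuming `SIMD32<UInt8>`, the pointwise `.==` operator, and the stdlib's `any(_:)`; the names are illustrative rather than the attached benchmark's exact code):

```swift
func searchSIMD(_ haystack: UnsafeRawBufferPointer, needle: UInt8) -> Int? {
    let needles = SIMD32<UInt8>(repeating: needle)
    var offset = 0
    while haystack.count - offset >= 32 {
        // Build the vector from a byte slice (see below for the code-size
        // cost of this initialiser).
        let chunk = SIMD32<UInt8>(haystack[offset ..< offset + 32])
        let matches = chunk .== needles
        if any(matches) {
            // No bitmask()/trailing_zeros() equivalent: the first matching
            // lane can only be found by inspecting each lane individually.
            for lane in 0..<32 where matches[lane] {
                return offset + lane
            }
        }
        offset += 32
    }
    // Scalar scan over the sub-32-byte tail.
    for index in offset..<haystack.count where haystack[index] == needle {
        return index
    }
    return nil
}
```

Even where `any(_:)` avoids a lanewise existence check, locating the first matching lane still needs the per-lane loop above, exactly the operation the `vpmovmskb`/`tzcnt` pair performs in one step in the Rust output.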
Finally, I was a bit startled at exactly how much code is needed to initialise the SIMD32s from the UnsafeRawBufferPointer. While I didn't expect C's level of brevity, the amount of code involved here is so high that if I add even a few extra lines of code to the Swift function, the SIMD32 initialiser gets outlined for code-size reasons.
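For what it's worth, later Swift versions offer a terser route for the load itself (assuming SE-0349's `loadUnaligned(fromByteOffset:as:)`, which is Swift 5.7+ and so postdates this report):

```swift
// SIMD32<UInt8> is a trivial type, so an unaligned raw load is allowed.
let chunk = haystack.loadUnaligned(fromByteOffset: offset, as: SIMD32<UInt8>.self)
```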
The result of all of this is that SIMD appears to be unusable for string processing tasks at this time, unless I've missed something substantial about how it's supposed to be used.