Open Quuxplusone opened 10 years ago
The function find is copied from libstdc++, and the function find2 is copied
from libc++.
I suppose find is faster than find2.
I appreciate the bug report; but pasting in libstdc++'s implementation is not helpful.
We can't use it in libc++, because the license is incompatible with libc++'s license.
Stealing this.
Attached find.patch
(2366 bytes, text/plain): Performance fix using duffs device
Attached find.patch
(6793 bytes, text/plain): Fix for find, find_if and find_if_not.
No evidence that 8 is the right number. I'll figure that out tomorrow, before I put this up for review.
(In reply to comment #7)
> No evidence that 8 is the right number. I'll figure that out tommorow and
> before I put this up for review.
What's the status of this?
The assembly generated for find and find2 with clang 3.8 -Ofast -std=c++14 -march=native -DNDEBUG is:
int* find<int*, int const&>(int*, int*, int const& const&): # @int*
find<int*, int const&>(int*, int*, int const& const&)
mov rax, rsi
sub rax, rdi
shr rax, 4
test eax, eax
jle .LBB0_11
mov r8d, dword ptr [rdx]
add eax, 1
lea r9, [rdi + 8]
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov rcx, r9
cmp dword ptr [rcx - 8], r8d
je .LBB0_23
cmp dword ptr [rcx - 4], r8d
je .LBB0_4
cmp dword ptr [rcx], r8d
je .LBB0_6
cmp dword ptr [rcx + 4], r8d
je .LBB0_8
add rdi, 16
add eax, -1
lea r9, [rcx + 16]
cmp eax, 1
jg .LBB0_2
add rcx, 8
mov rdi, rcx
.LBB0_11: # %._crit_edge
mov rax, rsi
sub rax, rdi
sar rax, 2
cmp rax, 3
je .LBB0_17
cmp rax, 2
je .LBB0_16
cmp rax, 1
jne .LBB0_14
mov eax, dword ptr [rdx]
jmp .LBB0_21
.LBB0_23: # %..loopexit.loopexit_crit_edge
add rcx, -8
mov rdi, rcx
jmp .LBB0_24
.LBB0_4:
add rdi, 4
mov rax, rdi
ret
.LBB0_6:
add rdi, 8
mov rax, rdi
ret
.LBB0_16: # %._crit_edge._crit_edge
mov eax, dword ptr [rdx]
jmp .LBB0_19
.LBB0_8:
add rdi, 12
mov rax, rdi
ret
.LBB0_17:
mov eax, dword ptr [rdx]
cmp dword ptr [rdi], eax
je .LBB0_24
add rdi, 4
.LBB0_19:
cmp dword ptr [rdi], eax
je .LBB0_24
add rdi, 4
.LBB0_21:
cmp dword ptr [rdi], eax
jne .LBB0_22
.LBB0_24: # %.loopexit
mov rax, rdi
ret
.LBB0_14:
mov rdi, rsi
mov rax, rdi
ret
.LBB0_22:
mov rdi, rsi
mov rax, rdi
ret
int* find2<int*, int const&>(int*, int*, int const& const&): # @int*
find2<int*, int const&>(int*, int*, int const& const&)
cmp rdi, rsi
je .LBB1_5
mov eax, dword ptr [rdx]
.LBB1_2: # =>This Inner Loop Header: Depth=1
cmp dword ptr [rdi], eax
je .LBB1_5
add rdi, 4
cmp rsi, rdi
jne .LBB1_2
mov rdi, rsi
.LBB1_5: # %._crit_edge
mov rax, rdi
ret
The assembly generated by gcc 6.1 is:
int* find<int*, int const&>(int*, int*, int const& const&):
mov rax, rsi
sub rax, rdi
mov rcx, rax
sar rax, 4
sar rcx, 2
mov r8d, eax
test eax, eax
jle .L16
mov ecx, DWORD PTR [rdx]
cmp ecx, DWORD PTR [rdi]
je .L17
cmp ecx, DWORD PTR [rdi+4]
je .L26
cmp ecx, DWORD PTR [rdi+8]
je .L27
cmp DWORD PTR [rdi+12], ecx
jne .L21
jmp .L28
.L10:
cmp DWORD PTR [rdi+16], ecx
je .L23
cmp DWORD PTR [rax+4], ecx
je .L29
cmp DWORD PTR [rax+8], ecx
je .L30
cmp DWORD PTR [rax+12], ecx
je .L31
mov rdi, rax
.L21:
lea rax, [rdi+16]
sub r8d, 1
jne .L10
mov rcx, rsi
sub rcx, rax
sar rcx, 2
.L2:
cmp rcx, 2
je .L11
cmp rcx, 3
je .L12
cmp rcx, 1
je .L32
mov rax, rsi
.L23:
ret
.L29:
lea rax, [rdi+20]
ret
.L30:
lea rax, [rdi+24]
ret
.L31:
lea rax, [rdi+28]
ret
.L32:
mov edx, DWORD PTR [rdx]
.L15:
cmp DWORD PTR [rax], edx
cmovne rax, rsi
ret
.L12:
mov edx, DWORD PTR [rdx]
cmp DWORD PTR [rax], edx
je .L23
add rax, 4
jmp .L14
.L11:
mov edx, DWORD PTR [rdx]
.L14:
cmp DWORD PTR [rax], edx
je .L23
add rax, 4
jmp .L15
.L17:
mov rax, rdi
ret
.L16:
mov rax, rdi
jmp .L2
.L28:
lea rax, [rdi+12]
ret
.L27:
lea rax, [rdi+8]
ret
.L26:
lea rax, [rdi+4]
ret
int* find2<int*, int const&>(int*, int*, int const& const&):
mov rax, rsi
cmp rdi, rsi
je .L40
mov edx, DWORD PTR [rdx]
cmp edx, DWORD PTR [rdi]
jne .L36
jmp .L39
.L37:
cmp DWORD PTR [rdi], edx
je .L39
.L36:
add rdi, 4
cmp rax, rdi
jne .L37
ret
.L39:
mov rax, rdi
.L40:
ret
The code for find2 is tighter; the code for find might have fewer data
dependencies and might be a bit faster. In my opinion, however, the assembly
generated for both functions by both clang and gcc is horrible. I am compiling
with -Ofast and -march=native on an AVX machine, and I would expect to see some
vectorized version like this:
$LL56@main:
movdqa xmm2, xmm4
movdqa xmm0, xmm4
movdqa xmm1, xmm4
lea rcx, QWORD PTR [rcx+64]
pcmpgtd xmm0, XMMWORD PTR [rcx-80]
pcmpgtd xmm2, XMMWORD PTR [rcx-96]
pcmpgtd xmm1, XMMWORD PTR [rcx-48]
paddd xmm2, xmm0
movdqa xmm0, xmm4
pcmpgtd xmm0, XMMWORD PTR [rcx-64]
paddd xmm1, xmm0
paddd xmm2, xmm1
psubd xmm3, xmm2
dec r8
jne SHORT $LL56@main
$LN54@main:
phaddd xmm3, xmm3
inc rdx
phaddd xmm3, xmm3
pextrw eax, xmm3, 0
which was taken from this SO answer
(http://stackoverflow.com/a/31509388/1422197).
Since that SSE2 version should be 7-8x faster than the libstdc++ version, that
(or its AVX equivalent) should be the objective, rather than trying to match the
libstdc++ implementation. [0]
I tried making the auto-vectorizer to trigger on find2 for raw pointer types
with __builtin_assume_aligned (for begin and end), __restrict (for begin and
end), and __builtin_assume(begin <= end). But none of them worked.
The question we should try to answer first is: is this worth solving in libc++
or should this be fixed in clang?
If this should be fixed in clang, I don't know if using Duff's device would
make things harder instead of easier.
If this should be fixed in libc++, we might want to go with Duff's device for
some cases, but we should be trying to use SIMD when possible (maybe adding
overloads for primitive types and using OpenMP's "#pragma loop SIMD" or
similar).
[0] There is another vectorized version, similar to the one in this SO answer:
http://stackoverflow.com/a/2743040/1422197
Just as an update, this should definitely be an LLVM bug I think. LLVM should
generate essentially perfect code here.
Currently, unrolling is *not* a problem. Top-of-tree for me unrolls by 64.
We should emit a vectorized version of this, but vectorizing this code is quite
challenging. It is reasonable to file a bug against the vectorizer, but I
wouldn't expect it to be fixed any time soon.
Attached find2.cpp
(876 bytes, text/plain): find2.cpp: Shows how the loop is unrolled
Attached find.cpp
(2424 bytes, text/x-csrc): find.cpp: Shows a case where the loop is NOT unrolled
Reassigning to clang so that they can figure out why the loop doesn't get unrolled sometimes; no changes to libc++ are proposed.
clang unrolls by different factors with libc++ (5) and libstdc++ (8)
#include <algorithm>
#include <cstdlib>
using namespace std;
// Test data: ten distinct ints, so std::find must compare element-by-element.
int arr[] = {1,2,3,4,5,6,7,8,9,10};
// Benchmark driver: picks a random element of arr and linear-searches for it
// with std::find. Returns the found value so the search cannot be optimized
// away entirely.
// NOTE(review): assumes 1 <= n <= 10 — n == 0 divides by zero in the modulo,
// and n > 10 indexes arr out of bounds; confirm callers respect this.
int BenchmarkLinearSearch(int n){
int r = std::rand() % n; // random index in [0, n)
auto result = std::find(begin(arr), end(arr), arr[r]); // value is guaranteed present, so result != end(arr)
return *result;
}
find.patch
(2366 bytes, text/plain)find.patch
(6793 bytes, text/plain)find2.cpp
(876 bytes, text/plain)find.cpp
(2424 bytes, text/x-csrc)