llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.57k stars 11.81k forks source link

Increment only one index to scan and operate on arrays in parallel #17097

Open llvmbot opened 11 years ago

llvmbot commented 11 years ago
Bugzilla Link 16723
Version trunk
OS Windows XP
Reporter LLVM Bugzilla Contributor

Extended Description

This is a potential enhancement request for the back-end.

This is D code, to be compiled with the ldc2 compiler (in D uint is a 32 bit unsigned integer):

void foo(in uint[] a, in uint[] b, in uint[] c, in uint[] d, in uint[] e, uint[] result) pure nothrow { foreach (immutable i; 0 .. result.length) result[i] = cast(uint)(a[i] + b[i] + c[i] + d[i] + e[i]); } void main() {}

(Compiled with ldc2 v.0.11.0 based on DMD v2.062 and LLVM 3.3svn)

ldmd2 -O -release -inline -noboundscheck -vectorize-loops -vectorize-slp -vectorize-slp-aggressive -output-s

For the loop of foo ldc2 gives: LBB0_2: movl %eax, (%esp) movl (%ebx), %eax addl (%ebp), %eax addl (%edi), %eax addl (%esi), %eax addl (%edx), %eax movl %eax, (%ecx) movl (%esp), %eax addl $4, %edx addl $4, %esi addl $4, %edi addl $4, %ebx addl $4, %ebp addl $4, %ecx decl %eax jne LBB0_2

This is a similar C99 program (D arrays are not just pointers, they are small structs of ponter-length, but the loop shouldn't be influenced by this):

include "stdio.h"

include "stdint.h"

void foo(size_t n, const uint32_t restrict a, const uint32_t restrict b, const uint32_t restrict c, const uint32_t restrict d, const uint32_t restrict e, uint32_t restrict result) { for (size_t i = 0; i < n; i++) result[i] = a[i] + b[i] + c[i] + d[i] + e[i]; } int main() { return 0; }

Compiled with: gcc -std=c99 -Ofast -S

gcc v.4.8.0 gives this for the loop of foo: L5: movl 40(%esp), %ecx movl (%edi,%edx,4), %eax addl 0(%ebp,%edx,4), %eax addl (%esi,%edx,4), %eax addl (%ebx,%edx,4), %eax addl (%ecx,%edx,4), %eax movl 44(%esp), %ecx movl %eax, (%ecx,%edx,4) addl $1, %edx cmpl 20(%esp), %edx jne L5

While for the loop of foo, Intel compiler V.13.0.1 (without restrict) gives: ... ..B2.4: # Preds ..B2.4 ..B2.3 movl (%rsi,%r10,8), %r11d #​12.21 addl (%rdx,%r10,8), %r11d #​12.28 movl (%r8,%r10,8), %ebp #​12.42 addl (%rcx,%r10,8), %r11d #​12.35 addl (%r9,%r10,8), %ebp #​12.49 addl %ebp, %r11d #​12.42 movl %r11d, (%rbx,%r10,8) #​12.9 movl 4(%rsi,%r10,8), %r11d #​12.21 addl 4(%rdx,%r10,8), %r11d #​12.28 movl 4(%r8,%r10,8), %ebp #​12.42 addl 4(%rcx,%r10,8), %r11d #​12.35 addl 4(%r9,%r10,8), %ebp #​12.49 addl %ebp, %r11d #​12.42 movl %r11d, 4(%rbx,%r10,8) #​12.9 incq %r10 #​11.5 cmpq %rax, %r10 #​11.5 jb ..B2.4 # Prob 63% #​11.5 ...

If I use only the a,b,c,d,result arrays, with the Intel compiler (without restrict), part of foo becomes: ... ..B3.25: # Preds ..B3.23 ..B3.25 movdqu (%rsi,%rax,4), %xmm2 #​19.28 movdqu (%rdx,%rax,4), %xmm0 #​19.28 lddqu (%rcx,%rax,4), %xmm1 #​19.35 paddd %xmm0, %xmm2 #​19.28 paddd %xmm1, %xmm2 #​19.35 movdqa %xmm2, (%r8,%rax,4) #​19.9 addq $4, %rax #​18.5 cmpq %r9, %rax #​18.5 jb ..B3.25 # Prob 82% #​18.5 ...

So in the end in this enhancement I suggest to explore the possibility and performance of replacing increment instructions like: addl $4, %edx addl $4, %esi addl $4, %edi addl $4, %ebx addl $4, %ebp addl $4, %ecx

with a single counter increment, and then use instructions like this to read and write arrays memory: movl (%edi,%edx,4), %eax

(Another potential improvement is to use instructions like paddd that sum four uint at a time and add after the loop a little extra serial code for situations where the input array lengths are not multiple of 4. But this is an unrelated and more complex enhancement).

llvmbot commented 11 years ago

Someone has suggested me to add the IR to this bug report, same compilation switches as before:

define x86_stdcallcc void @"\01D4test3fooFNaNbxAkxAkxAkxAkxAkAkZv"({ i32, i32 } %result_arg, { i32, i32 } %e_arg, { i32, i32 } %d_arg, { i32, i32 } %c_arg, { i32, i32 } %b_arg, { i32, i32 } %a_arg) { entry: %a = alloca { i32, i32 } %b = alloca { i32, i32 } %c = alloca { i32, i32 } %d = alloca { i32, i32 } %e = alloca { i32, i32 } %result = alloca { i32, i32 } %key5 = alloca i32, align 4 %limit6 = alloca i32, align 4 %i = alloca i32, align 4 store { i32, i32 } %a_arg, { i32, i32 } %a store { i32, i32 } %b_arg, { i32, i32 } %b store { i32, i32 } %c_arg, { i32, i32 } %c store { i32, i32 } %d_arg, { i32, i32 } %d store { i32, i32 } %e_arg, { i32, i32 } %e store { i32, i32 } %result_arg, { i32, i32 } %result store i32 0, i32 %key5 %tmp = load i32 %__key5 %tmp1 = getelementptr { i32, i32 } %result, i32 0, i32 0 %.len = load i32 %tmp1 store i32 %.len, i32 %__limit6 %tmp2 = load i32 %__limit6 br label %forcond

forcond: ; preds = %forinc, %entry %tmp3 = load i32 %__key5 %tmp4 = load i32 %__limit6 %tmp5 = icmp ult i32 %tmp3, %tmp4 br i1 %tmp5, label %forbody, label %endfor

forbody: ; preds = %forcond store i32* %__key5, i32 %i %tmp6 = getelementptr { i32, i32 } %result, i32 0, i32 1 %.ptr = load i32 %tmp6 %tmp7 = load i32 %i %tmp8 = load i32 %tmp7 %tmp9 = getelementptr i32 %.ptr, i32 %tmp8 %tmp10 = getelementptr { i32, i32 } %a, i32 0, i32 1 %.ptr11 = load i32 %tmp10 %tmp12 = load i32 %i %tmp13 = load i32 %tmp12 %tmp14 = getelementptr i32 %.ptr11, i32 %tmp13 %tmp15 = getelementptr { i32, i32 } %b, i32 0, i32 1 %.ptr16 = load i32 %tmp15 %tmp17 = load i32 %i %tmp18 = load i32 %tmp17 %tmp19 = getelementptr i32 %.ptr16, i32 %tmp18 %tmp20 = load i32 %tmp14 %tmp21 = load i32 %tmp19 %tmp22 = add i32 %tmp20, %tmp21 %tmp23 = getelementptr { i32, i32 } %c, i32 0, i32 1 %.ptr24 = load i32 %tmp23 %tmp25 = load i32 %i %tmp26 = load i32 %tmp25 %tmp27 = getelementptr i32 %.ptr24, i32 %tmp26 %tmp28 = load i32 %tmp27 %tmp29 = add i32 %tmp22, %tmp28 %tmp30 = getelementptr { i32, i32 } %d, i32 0, i32 1 %.ptr31 = load i32 %tmp30 %tmp32 = load i32 %i %tmp33 = load i32 %tmp32 %tmp34 = getelementptr i32 %.ptr31, i32 %tmp33 %tmp35 = load i32 %tmp34 %tmp36 = add i32 %tmp29, %tmp35 %tmp37 = getelementptr { i32, i32 } %e, i32 0, i32 1 %.ptr38 = load i32 %tmp37 %tmp39 = load i32* %i %tmp40 = load i32 %tmp39 %tmp41 = getelementptr i32 %.ptr38, i32 %tmp40 %tmp42 = load i32 %tmp41 %tmp43 = add i32 %tmp36, %tmp42 store i32 %tmp43, i32* %tmp9 br label %forinc

forinc: ; preds = %forbody %tmp44 = load i32 %__key5 %tmp45 = add i32 %tmp44, 1 store i32 %tmp45, i32 %__key5 br label %forcond

endfor: ; preds = %forcond ret void }