Open llvmbot opened 11 years ago
Someone has suggested me to add the IR to this bug report, same compilation switches as before:
define x86_stdcallcc void @"\01D4test3fooFNaNbxAkxAkxAkxAkxAkAkZv"({ i32, i32 } %result_arg, { i32, i32 } %e_arg, { i32, i32 } %d_arg, { i32, i32 } %c_arg, { i32, i32 } %b_arg, { i32, i32 } %a_arg) { entry: %a = alloca { i32, i32 } %b = alloca { i32, i32 } %c = alloca { i32, i32 } %d = alloca { i32, i32 } %e = alloca { i32, i32 } %result = alloca { i32, i32 } %key5 = alloca i32, align 4 %limit6 = alloca i32, align 4 %i = alloca i32, align 4 store { i32, i32 } %a_arg, { i32, i32 } %a store { i32, i32 } %b_arg, { i32, i32 } %b store { i32, i32 } %c_arg, { i32, i32 } %c store { i32, i32 } %d_arg, { i32, i32 } %d store { i32, i32 } %e_arg, { i32, i32 } %e store { i32, i32 } %result_arg, { i32, i32 } %result store i32 0, i32 %key5 %tmp = load i32 %__key5 %tmp1 = getelementptr { i32, i32 } %result, i32 0, i32 0 %.len = load i32 %tmp1 store i32 %.len, i32 %__limit6 %tmp2 = load i32 %__limit6 br label %forcond
forcond: ; preds = %forinc, %entry %tmp3 = load i32 %__key5 %tmp4 = load i32 %__limit6 %tmp5 = icmp ult i32 %tmp3, %tmp4 br i1 %tmp5, label %forbody, label %endfor
forbody: ; preds = %forcond store i32* %__key5, i32 %i %tmp6 = getelementptr { i32, i32 } %result, i32 0, i32 1 %.ptr = load i32 %tmp6 %tmp7 = load i32 %i %tmp8 = load i32 %tmp7 %tmp9 = getelementptr i32 %.ptr, i32 %tmp8 %tmp10 = getelementptr { i32, i32 } %a, i32 0, i32 1 %.ptr11 = load i32 %tmp10 %tmp12 = load i32 %i %tmp13 = load i32 %tmp12 %tmp14 = getelementptr i32 %.ptr11, i32 %tmp13 %tmp15 = getelementptr { i32, i32 } %b, i32 0, i32 1 %.ptr16 = load i32 %tmp15 %tmp17 = load i32 %i %tmp18 = load i32 %tmp17 %tmp19 = getelementptr i32 %.ptr16, i32 %tmp18 %tmp20 = load i32 %tmp14 %tmp21 = load i32 %tmp19 %tmp22 = add i32 %tmp20, %tmp21 %tmp23 = getelementptr { i32, i32 } %c, i32 0, i32 1 %.ptr24 = load i32 %tmp23 %tmp25 = load i32 %i %tmp26 = load i32 %tmp25 %tmp27 = getelementptr i32 %.ptr24, i32 %tmp26 %tmp28 = load i32 %tmp27 %tmp29 = add i32 %tmp22, %tmp28 %tmp30 = getelementptr { i32, i32 } %d, i32 0, i32 1 %.ptr31 = load i32 %tmp30 %tmp32 = load i32 %i %tmp33 = load i32 %tmp32 %tmp34 = getelementptr i32 %.ptr31, i32 %tmp33 %tmp35 = load i32 %tmp34 %tmp36 = add i32 %tmp29, %tmp35 %tmp37 = getelementptr { i32, i32 } %e, i32 0, i32 1 %.ptr38 = load i32 %tmp37 %tmp39 = load i32* %i %tmp40 = load i32 %tmp39 %tmp41 = getelementptr i32 %.ptr38, i32 %tmp40 %tmp42 = load i32 %tmp41 %tmp43 = add i32 %tmp36, %tmp42 store i32 %tmp43, i32* %tmp9 br label %forinc
forinc: ; preds = %forbody %tmp44 = load i32 %__key5 %tmp45 = add i32 %tmp44, 1 store i32 %tmp45, i32 %__key5 br label %forcond
endfor: ; preds = %forcond ret void }
Extended Description
This is a potential enhancement request for the back-end.
This is D code, to be compiled with the ldc2 compiler (in D uint is a 32 bit unsigned integer):
void foo(in uint[] a, in uint[] b, in uint[] c, in uint[] d, in uint[] e, uint[] result) pure nothrow { foreach (immutable i; 0 .. result.length) result[i] = cast(uint)(a[i] + b[i] + c[i] + d[i] + e[i]); } void main() {}
(Compiled with ldc2 v.0.11.0 based on DMD v2.062 and LLVM 3.3svn)
ldmd2 -O -release -inline -noboundscheck -vectorize-loops -vectorize-slp -vectorize-slp-aggressive -output-s
For the loop of foo ldc2 gives: LBB0_2: movl %eax, (%esp) movl (%ebx), %eax addl (%ebp), %eax addl (%edi), %eax addl (%esi), %eax addl (%edx), %eax movl %eax, (%ecx) movl (%esp), %eax addl $4, %edx addl $4, %esi addl $4, %edi addl $4, %ebx addl $4, %ebp addl $4, %ecx decl %eax jne LBB0_2
This is a similar C99 program (D arrays are not just pointers, they are small structs of ponter-length, but the loop shouldn't be influenced by this):
include "stdio.h"
include "stdint.h"
void foo(size_t n, const uint32_t restrict a, const uint32_t restrict b, const uint32_t restrict c, const uint32_t restrict d, const uint32_t restrict e, uint32_t restrict result) { for (size_t i = 0; i < n; i++) result[i] = a[i] + b[i] + c[i] + d[i] + e[i]; } int main() { return 0; }
Compiled with: gcc -std=c99 -Ofast -S
gcc v.4.8.0 gives this for the loop of foo: L5: movl 40(%esp), %ecx movl (%edi,%edx,4), %eax addl 0(%ebp,%edx,4), %eax addl (%esi,%edx,4), %eax addl (%ebx,%edx,4), %eax addl (%ecx,%edx,4), %eax movl 44(%esp), %ecx movl %eax, (%ecx,%edx,4) addl $1, %edx cmpl 20(%esp), %edx jne L5
While for the loop of foo, Intel compiler V.13.0.1 (without restrict) gives: ... ..B2.4: # Preds ..B2.4 ..B2.3 movl (%rsi,%r10,8), %r11d #12.21 addl (%rdx,%r10,8), %r11d #12.28 movl (%r8,%r10,8), %ebp #12.42 addl (%rcx,%r10,8), %r11d #12.35 addl (%r9,%r10,8), %ebp #12.49 addl %ebp, %r11d #12.42 movl %r11d, (%rbx,%r10,8) #12.9 movl 4(%rsi,%r10,8), %r11d #12.21 addl 4(%rdx,%r10,8), %r11d #12.28 movl 4(%r8,%r10,8), %ebp #12.42 addl 4(%rcx,%r10,8), %r11d #12.35 addl 4(%r9,%r10,8), %ebp #12.49 addl %ebp, %r11d #12.42 movl %r11d, 4(%rbx,%r10,8) #12.9 incq %r10 #11.5 cmpq %rax, %r10 #11.5 jb ..B2.4 # Prob 63% #11.5 ...
If I use only the a,b,c,d,result arrays, with the Intel compiler (without restrict), part of foo becomes: ... ..B3.25: # Preds ..B3.23 ..B3.25 movdqu (%rsi,%rax,4), %xmm2 #19.28 movdqu (%rdx,%rax,4), %xmm0 #19.28 lddqu (%rcx,%rax,4), %xmm1 #19.35 paddd %xmm0, %xmm2 #19.28 paddd %xmm1, %xmm2 #19.35 movdqa %xmm2, (%r8,%rax,4) #19.9 addq $4, %rax #18.5 cmpq %r9, %rax #18.5 jb ..B3.25 # Prob 82% #18.5 ...
So in the end in this enhancement I suggest to explore the possibility and performance of replacing increment instructions like: addl $4, %edx addl $4, %esi addl $4, %edi addl $4, %ebx addl $4, %ebp addl $4, %ecx
with a single counter increment, and then use instructions like this to read and write arrays memory: movl (%edi,%edx,4), %eax
(Another potential improvement is to use instructions like paddd that sum four uint at a time and add after the loop a little extra serial code for situations where the input array lengths are not multiple of 4. But this is an unrelated and more complex enhancement).