[AArch64] gather struct load should be reused similar to normal struct load

vfdff commented 2 months ago

normal struct load case: https://godbolt.org/z/YezKP8YM4

for (int i = 0; i < eulers_per_block; i ++) {
#pragma clang loop vectorize(enable)
#pragma GCC ivdep
 for (int tid=0; tid<block_sz; tid++){
  int index = tid;
  s_ref_real[i][tid] = mdlComplex[index].real();
  s_ref_imag[i][tid] = mdlComplex[index].imag();
}
}

Related assembly code generated by LLVM: it works fine, as both the real and imaginary parts are loaded at once by a single `ld2w`.

.LBB0_2:
    ld2w    { z0.s, z1.s }, p0/z, [x22]         
    add     x22, x22, x13
    st1w    { z0.s }, p0, [x10, x21, lsl #2]   # r0, r1, ... r7 (assme VScale=2)
    st1w    { z1.s }, p0, [x11, x21, lsl #2]   #i0, i1, .... i7
    add     x21, x21, x12
    cmp     x21, #256
    b.ne    .LBB0_2

gather struct load: https://godbolt.org/z/b5GoT4qqv

for (int i = 0; i < eulers_per_block; i ++) {
#pragma clang loop vectorize(enable)
#pragma GCC ivdep
 for (int tid=0; tid<block_sz; tid++){
  int index = indexarr[tid];
  s_ref_real[i][tid] = mdlComplex[index].real();
  s_ref_imag[i][tid] = mdlComplex[index].imag();
}
}

Related assembly code generated by LLVM: it issues separate gather loads for the real and the imaginary parts, doubling the number of loads.

.LBB0_2:
    add     x22, x9, x21, lsl #2
    ld1sw   { z0.d }, p0/z, [x9, x21, lsl #2]     #[index.0, index.1], .... [index.6, index.7]  (assme VScale=2)
    ld1sw   { z2.d }, p0/z, [x22, #1, mul vl]   #[index.8, index.9], .... [index.14, index.15]
    add     x22, x10, #4                              
    lsl     z0.d, z0.d, #3
    lsl     z2.d, z2.d, #3
    ld1w    { z1.d }, p0/z, [x10, z0.d]          # [r0, i0], [r1,i1] ... [r3,i3]
    ld1w    { z3.d }, p0/z, [x10, z2.d]          # [r4, i4], [r5,i5] ... [r7,i7]
    uzp1    z1.s, z1.s, z3.s                          # r0, r1, r2, ... r7
    st1w    { z1.s }, p1, [x11, x21, lsl #2]
    ld1w    { z0.d }, p0/z, [x22, z0.d]          # [i0, r1], [i1,r2] ... [i3,r4]  , can reused with the above ld1w ?
    ld1w    { z1.d }, p0/z, [x22, z2.d]          # [i4, r5], [i5,r6] ... [i7,r8]
    uzp1    z0.s, z0.s, z1.s                          # i0, i1, i2, ... i7
    st1w    { z0.s }, p1, [x12, x21, lsl #2]
    add     x21, x21, x13
    cmp     x21, #256
    b.ne    .LBB0_2

llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Allen (vfdff)

* normal struct load case: https://godbolt.org/z/fGYKoM8W3 ``` for (int i = 0; i < eulers_per_block; i ++) { #pragma clang loop vectorize(enable) #pragma GCC ivdep for (int tid=0; tid<block_sz; tid++){ int index = tid; s_ref_real[i][tid] = mdlComplex[index].real(); s_ref_imag[i][tid] = mdlComplex[index].imag(); } } ``` * related assemble code generated by llvm: It works **fine** as we load the both **real** and **image** part one ``` .LBB0_2: ld2w { z0.s, z1.s }, p0/z, [x22] add x22, x22, x13 st1w { z0.s }, p0, [x10, x21, lsl #2] # r0, r1, ... r7 (assme VScale=2) st1w { z1.s }, p0, [x11, x21, lsl #2] #i0, i1, .... i7 add x21, x21, x12 cmp x21, #256 b.ne .LBB0_2 ``` * gather struct load: https://godbolt.org/z/b5GoT4qqv ``` for (int i = 0; i < eulers_per_block; i ++) { #pragma clang loop vectorize(enable) #pragma GCC ivdep for (int tid=0; tid<block_sz; tid++){ int index = indexarr[tid]; s_ref_real[i][tid] = mdlComplex[index].real(); s_ref_imag[i][tid] = mdlComplex[index].imag(); } } ``` * related assemble code generated by llvm: It **double load** the real and image parts ``` .LBB0_2: add x22, x9, x21, lsl #2 ld1sw { z0.d }, p0/z, [x9, x21, lsl #2] #index.0, index.1, .... index.7 (assme VScale=2) ld1sw { z2.d }, p0/z, [x22, #1, mul vl] #index.8, index.9, .... index.15 add x22, x10, #4 lsl z0.d, z0.d, #3 lsl z2.d, z2.d, #3 ld1w { z1.d }, p0/z, [x10, z0.d] # r0, i0, r1, ... i3 ld1w { z3.d }, p0/z, [x10, z2.d] # r4, i4, r5, ... i7 uzp1 z1.s, z1.s, z3.s # r0, r1, r2, ... r7 st1w { z1.s }, p1, [x11, x21, lsl #2] ld1w { z0.d }, p0/z, [x22, z0.d] # i0, r1, i1, ... i4 , can reused with the above ld1w ? ld1w { z1.d }, p0/z, [x22, z2.d] # i4, r5, i5, ... i8 uzp1 z0.s, z0.s, z1.s # i0, i1, i2, ... i7 st1w { z0.s }, p1, [x12, x21, lsl #2] add x21, x21, x13 cmp x21, #256 b.ne .LBB0_2 ```

vfdff commented 2 months ago

We get better assembly if we change

`%arrayidx10 = getelementptr inbounds [10000 x %"class.std::__1::complex"], ptr @mdlComplex, i32 0, i32 %1` into `%arrayidx10.idx = shl nsw i32 %1, 3` followed by `%arrayidx10 = getelementptr inbounds i8, ptr @mdlComplex, i32 %arrayidx10.idx`.

See the details at https://gcc.godbolt.org/z/GrbsE7EaP

for.body8:                                        ; preds = %for.cond5.preheader, %for.body8
%indvars.iv48 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next49, %for.body8 ]
%indvars.iv = phi i64 [ %0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
%arrayidx = getelementptr inbounds [1000 x i32], ptr @indexarr, i64 0, i64 %indvars.iv
%1 = load i32, ptr %arrayidx, align 4, !tbaa !10
; %idxprom9 = sext i32 %1 to i64
; %arrayidx10 = getelementptr inbounds [10000 x %"class.std::__1::complex"], ptr @mdlComplex, i32 0, i32 %1
%arrayidx10.idx = shl nsw i32 %1, 3
%arrayidx10 = getelementptr inbounds i8, ptr @mdlComplex, i32 %arrayidx10.idx
%2 = load float, ptr %arrayidx10, align 4, !tbaa !14
%__im_.i = getelementptr inbounds nuw i8, ptr %arrayidx10, i32 4
%3 = load float, ptr %__im_.i, align 4, !tbaa !17
%arrayidx20 = getelementptr inbounds [16 x [256 x float]], ptr @s_ref_real, i64 0, i64 %indvars.iv53, i64 %indvars.iv48
store float %2, ptr %arrayidx20, align 4, !tbaa !18
%arrayidx25 = getelementptr inbounds [16 x [256 x float]], ptr @s_ref_imag, i64 0, i64 %indvars.iv53, i64 %indvars.iv48
store float %3, ptr %arrayidx25, align 4, !tbaa !18
%indvars.iv.next = add nsw i64 %indvars.iv, 1
%indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
%exitcond.not = icmp eq i64 %indvars.iv.next49, 256
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !19

vfdff commented 2 months ago

The following rewritten version is much better: the inner loop body uses 3 `ld1w` instructions, compared to 5 `ld1w` for the prior input. See https://godbolt.org/z/sq1K9GEvG

// Reproducer: gathers the real and imaginary parts of mdlComplex[indexarr[tid]]
// into s_ref_real[i][tid] / s_ref_imag[i][tid] for every row i, reading the
// complex array through a flat float pointer instead of .real()/.imag().
//
// sz is the number of inner-loop iterations (entries of indexarr consumed per row).
// noinline keeps the loop in its own function.
__attribute__((noinline))
void foo(int sz)
{
// View the complex array as a flat float array.
// NOTE(review): assumes each element is stored as two adjacent floats
// (real, then imaginary) -- confirm against the declaration of mdlComplex.
float *PmdlComplex = (float *)mdlComplex;

for (int i = 0; i < eulers_per_block; i ++) {
// Vectorization hints for the inner loop: request vectorization (clang) and
// assert there are no loop-carried dependencies (GCC ivdep).
#pragma clang loop vectorize(enable)
#pragma GCC ivdep
 for (int tid=0; tid<sz; tid++){
  // Indirect (gather) index: which complex element to read for this lane.
  int index = indexarr[tid];
  // Original accessor-based form, kept for comparison:
  // s_ref_real[i][tid] = mdlComplex[index].real();
  // s_ref_imag[i][tid] = mdlComplex[index].imag();
  // Flat-array form: float 2*index is used as the real part, 2*index+1 as the imaginary part.
  s_ref_real[i][tid] = PmdlComplex[index *2 ];
  s_ref_imag[i][tid] = PmdlComplex[index *2 + 1];
}
}
}

llvm / llvm-project

[AArch64] gather struct load should be reused similar to normal struct load #107345