Open vfdff opened 2 months ago
@llvm/issue-subscribers-backend-aarch64
Author: Allen (vfdff)
We get better assembly if we change
`%arrayidx10 = getelementptr inbounds [10000 x %"class.std::__1::complex"], ptr @mdlComplex, i32 0, i32 %1`
into
`%arrayidx10.idx = shl nsw i32 %1, 3`
`%arrayidx10 = getelementptr inbounds i8, ptr @mdlComplex, i32 %arrayidx10.idx`
See the details at https://gcc.godbolt.org/z/GrbsE7EaP
for.body8: ; preds = %for.cond5.preheader, %for.body8
%indvars.iv48 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next49, %for.body8 ]
%indvars.iv = phi i64 [ %0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
%arrayidx = getelementptr inbounds [1000 x i32], ptr @indexarr, i64 0, i64 %indvars.iv
%1 = load i32, ptr %arrayidx, align 4, !tbaa !10
; %idxprom9 = sext i32 %1 to i64
; %arrayidx10 = getelementptr inbounds [10000 x %"class.std::__1::complex"], ptr @mdlComplex, i32 0, i32 %1
%arrayidx10.idx = shl nsw i32 %1, 3
%arrayidx10 = getelementptr inbounds i8, ptr @mdlComplex, i32 %arrayidx10.idx
%2 = load float, ptr %arrayidx10, align 4, !tbaa !14
%__im_.i = getelementptr inbounds nuw i8, ptr %arrayidx10, i32 4
%3 = load float, ptr %__im_.i, align 4, !tbaa !17
%arrayidx20 = getelementptr inbounds [16 x [256 x float]], ptr @s_ref_real, i64 0, i64 %indvars.iv53, i64 %indvars.iv48
store float %2, ptr %arrayidx20, align 4, !tbaa !18
%arrayidx25 = getelementptr inbounds [16 x [256 x float]], ptr @s_ref_imag, i64 0, i64 %indvars.iv53, i64 %indvars.iv48
store float %3, ptr %arrayidx25, align 4, !tbaa !18
%indvars.iv.next = add nsw i64 %indvars.iv, 1
%indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
%exitcond.not = icmp eq i64 %indvars.iv.next49, 256
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !19
The following rewritten version is much better: the inner loop body contains 3 ld1w instructions, compared to 5 ld1w for the prior input. See https://godbolt.org/z/sq1K9GEvG
// Gathers elements of mdlComplex selected through indexarr and stores their
// two float components into s_ref_real and s_ref_imag.
//
// sz: number of indexarr entries (tid = 0 .. sz-1) processed in each outer
//     iteration.
//
// NOTE(review): noinline presumably keeps this function as a standalone unit
// so that its generated code can be inspected in isolation - confirm.
__attribute__((noinline))
void foo(int sz)
{
// View mdlComplex as a flat float array: element `index` then has its two
// components at positions 2*index and 2*index + 1.
float *PmdlComplex = (float *)mdlComplex;
for (int i = 0; i < eulers_per_block; i ++) {
// Ask Clang to vectorize the inner loop; the ivdep pragma is a GCC hint that
// the loop carries no dependencies between iterations.
#pragma clang loop vectorize(enable)
#pragma GCC ivdep
for (int tid=0; tid<sz; tid++){
// Indirect (gather) access: the element to read is chosen by indexarr[tid].
int index = indexarr[tid];
// Original struct-based form, kept for comparison with the flat-array form
// used below:
// s_ref_real[i][tid] = mdlComplex[index].real();
// s_ref_imag[i][tid] = mdlComplex[index].imag();
s_ref_real[i][tid] = PmdlComplex[index *2 ];
s_ref_imag[i][tid] = PmdlComplex[index *2 + 1];
}
}
}
normal struct load case: https://godbolt.org/z/YezKP8YM4
Related assembly code generated by LLVM: it works fine, as we load both the real and imaginary parts only once.
gather struct load: https://godbolt.org/z/b5GoT4qqv
Related assembly code generated by LLVM: it loads the real and imaginary parts twice.