llvm / llvm-project


wasted opportunities with array indexing patterns in AMDGPU backend #55314

Open · Epliz opened this issue 2 years ago

Epliz commented 2 years ago

Hi,

I have observed some suboptimal generated assembly for what seem like pretty common code patterns with Clang 14.0.0 from ROCm 5.0.2.

For this example kernel code:

__global__
void suboptimal_indexing(int const* a, int const* b, int const* c, int* d, size_t N, size_t C, uint32_t offset) {
  uint32_t globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (globalThreadId < N) {
    for (size_t i = 0; i < C; i++) {
      int r = 0;
      // access 0
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 1
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 2
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 3
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 4
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 5
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 6
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;
      // access 7
      r += a[globalThreadId] + b[globalThreadId] + c[globalThreadId];
      globalThreadId += offset;

      d[globalThreadId] = r;
    }
  }
}

The generated assembly (at -O3) is:


# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1031
    .text
    .amdgcn_target "amdgcn-amd-amdhsa--gfx1031"
    .protected  _Z19suboptimal_indexingPKiS0_S0_Pimmj ; -- Begin function _Z19suboptimal_indexingPKiS0_S0_Pimmj
    .globl  _Z19suboptimal_indexingPKiS0_S0_Pimmj
    .p2align    8
    .type   _Z19suboptimal_indexingPKiS0_S0_Pimmj,@function
_Z19suboptimal_indexingPKiS0_S0_Pimmj:  ; @_Z19suboptimal_indexingPKiS0_S0_Pimmj
; %bb.0:
    s_load_dword s0, s[4:5], 0x4
    s_load_dwordx4 s[12:15], s[6:7], 0x20
    v_mov_b32_e32 v1, 0
    s_waitcnt lgkmcnt(0)
    s_and_b32 s0, s0, 0xffff
    s_cmp_lg_u64 s[14:15], 0
    s_mul_i32 s8, s8, s0
    s_cselect_b32 s0, -1, 0
    v_add_nc_u32_e32 v0, s8, v0
    v_cmp_gt_u64_e32 vcc_lo, s[12:13], v[0:1]
    s_and_b32 s0, vcc_lo, s0
    s_and_saveexec_b32 s1, s0
    s_cbranch_execz BB0_3
; %bb.1:
    s_clause 0x1
    s_load_dword s8, s[6:7], 0x30
    s_load_dwordx8 s[0:7], s[6:7], 0x0
    v_mov_b32_e32 v1, 0
    s_waitcnt lgkmcnt(0)
    s_add_i32 s16, s8, s8
    s_lshl_b32 s9, s8, 3
    s_add_i32 s17, s16, s8
    s_mul_i32 s10, s8, 7
    s_add_i32 s17, s17, s8
    s_mul_i32 s11, s8, 6
    s_add_i32 s18, s17, s8
    s_mul_i32 s12, s8, 5
    s_lshl_b32 s13, s8, 2
    s_mul_i32 s16, s8, 3
    s_lshl_b32 s17, s8, 1
    s_add_i32 s18, s18, s8
BB0_2:                                  ; =>This Inner Loop Header: Depth=1
    v_lshlrev_b64 v[2:3], 2, v[0:1]
    v_add_nc_u32_e32 v4, s8, v0
    v_mov_b32_e32 v5, v1
    v_add_nc_u32_e32 v6, s17, v0
    v_mov_b32_e32 v7, v1
    v_add_nc_u32_e32 v8, s16, v0
    v_add_co_u32 v20, vcc_lo, s0, v2
    v_add_co_ci_u32_e32 v21, vcc_lo, s1, v3, vcc_lo
    v_add_co_u32 v22, vcc_lo, s2, v2
    v_lshlrev_b64 v[18:19], 2, v[4:5]
    v_add_co_ci_u32_e32 v23, vcc_lo, s3, v3, vcc_lo
    v_add_co_u32 v2, vcc_lo, s4, v2
    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo
    global_load_dword v38, v[20:21], off
    global_load_dword v39, v[22:23], off
    global_load_dword v40, v[2:3], off
    v_add_co_u32 v2, vcc_lo, s0, v18
    v_mov_b32_e32 v9, v1
    v_add_nc_u32_e32 v10, s13, v0
    v_mov_b32_e32 v11, v1
    v_add_nc_u32_e32 v12, s12, v0
    v_mov_b32_e32 v13, v1
    v_add_nc_u32_e32 v14, s11, v0
    v_mov_b32_e32 v15, v1
    v_add_nc_u32_e32 v16, s10, v0
    v_mov_b32_e32 v17, v1
    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v19, vcc_lo
    v_lshlrev_b64 v[5:6], 2, v[6:7]
    v_add_co_u32 v20, vcc_lo, s2, v18
    v_add_co_ci_u32_e32 v21, vcc_lo, s3, v19, vcc_lo
    v_lshlrev_b64 v[7:8], 2, v[8:9]
    v_lshlrev_b64 v[9:10], 2, v[10:11]
    v_lshlrev_b64 v[11:12], 2, v[12:13]
    v_lshlrev_b64 v[13:14], 2, v[14:15]
    v_lshlrev_b64 v[15:16], 2, v[16:17]
    v_add_co_u32 v17, vcc_lo, s4, v18
    v_add_co_ci_u32_e32 v18, vcc_lo, s5, v19, vcc_lo
    v_add_co_u32 v22, vcc_lo, s0, v5
    v_add_co_ci_u32_e32 v23, vcc_lo, s1, v6, vcc_lo
    v_add_co_u32 v24, vcc_lo, s2, v5
    v_add_co_ci_u32_e32 v25, vcc_lo, s3, v6, vcc_lo
    v_add_co_u32 v5, vcc_lo, s4, v5
    v_add_co_ci_u32_e32 v6, vcc_lo, s5, v6, vcc_lo
    v_add_co_u32 v26, vcc_lo, s0, v7
    v_add_co_ci_u32_e32 v27, vcc_lo, s1, v8, vcc_lo
    v_add_co_u32 v28, vcc_lo, s2, v7
    v_add_co_ci_u32_e32 v29, vcc_lo, s3, v8, vcc_lo
    v_add_co_u32 v7, vcc_lo, s4, v7
    v_add_co_ci_u32_e32 v8, vcc_lo, s5, v8, vcc_lo
    v_add_co_u32 v30, vcc_lo, s0, v9
    global_load_dword v19, v[2:3], off
    v_add_co_ci_u32_e32 v31, vcc_lo, s1, v10, vcc_lo
    v_add_co_u32 v32, vcc_lo, s2, v9
    v_add_co_ci_u32_e32 v33, vcc_lo, s3, v10, vcc_lo
    v_add_co_u32 v9, vcc_lo, s4, v9
    global_load_dword v20, v[20:21], off
    global_load_dword v21, v[17:18], off
    v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
    v_add_co_u32 v34, vcc_lo, s0, v11
    v_add_co_ci_u32_e32 v35, vcc_lo, s1, v12, vcc_lo
    global_load_dword v22, v[22:23], off
    global_load_dword v23, v[24:25], off
    v_add_co_u32 v36, vcc_lo, s2, v11
    v_add_co_ci_u32_e32 v37, vcc_lo, s3, v12, vcc_lo
    v_add_co_u32 v2, vcc_lo, s4, v11
    global_load_dword v24, v[5:6], off
    global_load_dword v25, v[26:27], off
    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v12, vcc_lo
    v_add_co_u32 v11, vcc_lo, s0, v13
    v_add_co_ci_u32_e32 v12, vcc_lo, s1, v14, vcc_lo
    v_add_co_u32 v17, vcc_lo, s2, v13
    global_load_dword v26, v[28:29], off
    global_load_dword v27, v[7:8], off
    v_add_co_ci_u32_e32 v18, vcc_lo, s3, v14, vcc_lo
    v_add_co_u32 v5, vcc_lo, s4, v13
    v_add_co_ci_u32_e32 v6, vcc_lo, s5, v14, vcc_lo
    global_load_dword v13, v[30:31], off
    global_load_dword v14, v[32:33], off
    global_load_dword v28, v[9:10], off
    global_load_dword v29, v[34:35], off
    global_load_dword v30, v[36:37], off
    global_load_dword v31, v[2:3], off
    v_add_co_u32 v7, vcc_lo, s0, v15
    v_add_co_ci_u32_e32 v8, vcc_lo, s1, v16, vcc_lo
    v_add_co_u32 v9, vcc_lo, s2, v15
    global_load_dword v11, v[11:12], off
    global_load_dword v12, v[17:18], off
    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v16, vcc_lo
    v_add_co_u32 v2, vcc_lo, s4, v15
    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v16, vcc_lo
    global_load_dword v5, v[5:6], off
    global_load_dword v6, v[7:8], off
    global_load_dword v7, v[9:10], off
    global_load_dword v8, v[2:3], off
    v_add_nc_u32_e32 v0, s9, v0
    s_add_u32 s14, s14, -1
    s_addc_u32 s15, s15, -1
    s_cmp_lg_u64 s[14:15], 0
    s_waitcnt vmcnt(22)
    v_add_nc_u32_e32 v2, v39, v38
    s_waitcnt vmcnt(20)
    v_add3_u32 v2, v2, v40, v19
    s_waitcnt vmcnt(18)
    v_add3_u32 v2, v2, v20, v21
    s_waitcnt vmcnt(16)
    v_add3_u32 v2, v2, v22, v23
    s_waitcnt vmcnt(14)
    v_add3_u32 v2, v2, v24, v25
    s_waitcnt vmcnt(12)
    v_add3_u32 v2, v2, v26, v27
    s_waitcnt vmcnt(10)
    v_add3_u32 v2, v2, v13, v14
    s_waitcnt vmcnt(8)
    v_add3_u32 v2, v2, v28, v29
    s_waitcnt vmcnt(6)
    v_add3_u32 v2, v2, v30, v31
    s_waitcnt vmcnt(4)
    v_add3_u32 v9, v2, v11, v12
    v_lshlrev_b64 v[2:3], 2, v[0:1]
    v_add3_u32 v0, s18, s8, v4
    s_waitcnt vmcnt(2)
    v_add3_u32 v4, v9, v5, v6
    v_add_co_u32 v2, vcc_lo, s6, v2
    v_add_co_ci_u32_e32 v3, vcc_lo, s7, v3, vcc_lo
    s_waitcnt vmcnt(0)
    v_add3_u32 v4, v4, v7, v8
    global_store_dword v[2:3], v4, off
    s_cbranch_scc1 BB0_2
BB0_3:
    s_endpgm
    .section    .rodata,#alloc
    .p2align    6
    .amdhsa_kernel _Z19suboptimal_indexingPKiS0_S0_Pimmj
        .amdhsa_group_segment_fixed_size 0
        .amdhsa_private_segment_fixed_size 0
        .amdhsa_kernarg_size 112
        .amdhsa_user_sgpr_private_segment_buffer 1
        .amdhsa_user_sgpr_dispatch_ptr 1
        .amdhsa_user_sgpr_queue_ptr 0
        .amdhsa_user_sgpr_kernarg_segment_ptr 1
        .amdhsa_user_sgpr_dispatch_id 0
        .amdhsa_user_sgpr_flat_scratch_init 0
        .amdhsa_user_sgpr_private_segment_size 0
        .amdhsa_wavefront_size32 1
        .amdhsa_system_sgpr_private_segment_wavefront_offset 0
        .amdhsa_system_sgpr_workgroup_id_x 1
        .amdhsa_system_sgpr_workgroup_id_y 0
        .amdhsa_system_sgpr_workgroup_id_z 0
        .amdhsa_system_sgpr_workgroup_info 0
        .amdhsa_system_vgpr_workitem_id 0
        .amdhsa_next_free_vgpr 41
        .amdhsa_next_free_sgpr 19
        .amdhsa_reserve_flat_scratch 0
        .amdhsa_float_round_mode_32 0
        .amdhsa_float_round_mode_16_64 0
        .amdhsa_float_denorm_mode_32 3
        .amdhsa_float_denorm_mode_16_64 3
        .amdhsa_dx10_clamp 1
        .amdhsa_ieee_mode 1
        .amdhsa_fp16_overflow 0
        .amdhsa_workgroup_processor_mode 1
        .amdhsa_memory_ordered 1
        .amdhsa_forward_progress 0
        .amdhsa_exception_fp_ieee_invalid_op 0
        .amdhsa_exception_fp_denorm_src 0
        .amdhsa_exception_fp_ieee_div_zero 0
        .amdhsa_exception_fp_ieee_overflow 0
        .amdhsa_exception_fp_ieee_underflow 0
        .amdhsa_exception_fp_ieee_inexact 0
        .amdhsa_exception_int_div_zero 0
    .end_amdhsa_kernel
    .text
.Lfunc_end0:
    .size   _Z19suboptimal_indexingPKiS0_S0_Pimmj, .Lfunc_end0-_Z19suboptimal_indexingPKiS0_S0_Pimmj
                                        ; -- End function
    .section    .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 940
; NumSgprs: 21
; NumVgprs: 41
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 2
; VGPRBlocks: 5
; NumSGPRsForWavesPerEU: 21
; NumVGPRsForWavesPerEU: 41
; Occupancy: 16
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0

We can observe two issues with the generated code:

1) Several VGPRs are used to index the same array several times, while the same VGPR pair could simply be reused for the incremented address.
2) Incrementing the index with globalThreadId += offset; causes as many VGPR additions as there are arrays accessed. Since the offset is a 32-bit integer, the loads could instead use the global_load_dword vdest, voffset, saddr variant, and then the offset would only need to be incremented once for all arrays.

I think both of these issues are quite significant.
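
To make point 1 more concrete, here is a rough source-level sketch of the address reuse I would expect (my own hand-rewrite for illustration, assuming the usual HIP headers; it is not a claim about what any existing LLVM pass does):

__device__ int sum_strided(int const* a, int const* b, int const* c,
                           uint32_t globalThreadId, uint32_t offset) {
  // One running address (VGPR pair) per array, bumped in place instead of
  // recomputing base + 64-bit extended index for every access.
  int const* pa = a + globalThreadId;
  int const* pb = b + globalThreadId;
  int const* pc = c + globalThreadId;
  int r = 0;
  for (int k = 0; k < 8; ++k) {  // the 8 unrolled accesses
    r += *pa + *pb + *pc;
    pa += offset;
    pb += offset;
    pc += offset;
  }
  return r;
}

Point 2 would go further: with the saddr + 32-bit voffset form of global_load_dword, the three pointer bumps above would collapse into a single 32-bit offset increment shared by all arrays.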

I hope this can be fixed in a future release. If it has already been addressed, please let me know and disregard this report.

Best regards, Epliz.

llvmbot commented 2 years ago

@llvm/issue-subscribers-backend-amdgpu

Epliz commented 2 years ago

For the second point, I guess the compiler can't do much, since multiplying the 32-bit index by the type size might require a wider type... Would it be possible to have intrinsics that give access to the base + offset versions of the load/store instructions?
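
To spell out the overflow concern, here is a minimal sketch (my own illustration, assuming plain C++ semantics for the index arithmetic):

#include <cstdint>

// a[i] with a 32-bit element index i is effectively
//   *(const int*)((const char*)a + (uint64_t)i * sizeof(int))
// Since i can be close to UINT32_MAX, the byte offset i * sizeof(int) can be
// up to roughly 2^34, so it must be carried in 64 bits (a VGPR pair) unless
// the compiler can prove the multiply does not overflow 32 bits.
static const int* element_address(const int* a, uint32_t i) {
  uint64_t byte_off = (uint64_t)i * sizeof(int);  // exact, needs 64 bits
  // uint32_t narrow = i * (uint32_t)sizeof(int); // would wrap for large i
  return (const int*)((const char*)a + byte_off);
}

Keeping the byte offset in uint32_t explicitly would put that no-overflow assumption in the programmer's hands.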

If you have the opportunity, please ask the hardware engineers to add load/store instructions with addressing modes that accept 32/64-bit signed/unsigned offsets with strides of 1, 2, 4, and 8.

Epliz commented 1 year ago

For anyone who might read this: it is possible to get the "nice" load/store instructions that use an SGPR pair + VGPR offset by writing code like the following:

template<typename T>
__device__
static T read(T const* a, uint32_t i) {
  const uint8_t* a_ = (const uint8_t*) a;
  // keeping byte offset in 32 bit unsigned
  // makes it possible to use SGPR-pair + VGPR offset load instruction
  uint32_t byte_off = i * sizeof(T);
  return *((const T*) (a_ + byte_off));
}

template<typename T>
__device__
static void write(T * a, uint32_t i, T val) {
  uint8_t* a_ = (uint8_t*) a;
  // keeping byte offset in 32 bit unsigned
  // makes it possible to use SGPR-pair + VGPR offset store instruction
  uint32_t byte_off = i * sizeof(T);
  *((T*) (a_ + byte_off)) = val;
}

__global__
void better_indexing(float const* a, float const* b, float const* c, float* d, size_t N, size_t C, uint32_t offset) {
  size_t blockStart = (blockIdx.x * blockDim.x) * C; // warp uniform value, will go in SGPR
  if (blockStart >= N) {
    return;
  }
  // C * warpSize assumed to fit in uint32_t
  uint32_t blockMax = (uint32_t) std::min(C * warpSize, N - blockStart);

  // align pointers
  const float* a_ = &a[blockStart]; // warp uniform value, will go in SGPR pair
  const float* b_ = &b[blockStart]; // warp uniform value, will go in SGPR pair
  const float* c_ = &c[blockStart]; // warp uniform value, will go in SGPR pair
  float* d_ = &d[blockStart]; // warp uniform value, will go in SGPR pair

  uint32_t subOffset = threadIdx.x;
  for (uint32_t i = 0; i < C; i++, subOffset += warpSize) {
    if (subOffset < blockMax) {
      float r = read(a_, subOffset) + read(b_, subOffset) + read(c_, subOffset);
      write(d_, subOffset, r);
    }
  }
}

The loop part gives the following nice assembly:

.LBB1_3:                                ;   in Loop: Header=BB1_4 Depth=1
    s_or_b64 exec, exec, s[12:13]
    v_mov_b32_e32 v2, s2
    v_mov_b32_e32 v3, s3
    v_cmp_lt_u64_e32 vcc, s[10:11], v[2:3]
    v_add_u32_e32 v0, 64, v0
    s_add_i32 s10, s10, 1
    v_add_u32_e32 v1, 0x100, v1
    s_and_b64 vcc, exec, vcc
    s_cbranch_vccz .LBB1_6
.LBB1_4:                                ; =>This Inner Loop Header: Depth=1
    v_cmp_gt_u32_e32 vcc, s14, v0
    s_and_saveexec_b64 s[12:13], vcc
    s_cbranch_execz .LBB1_3
; %bb.5:                                ;   in Loop: Header=BB1_4 Depth=1
    global_load_dword v2, v1, s[0:1]
    global_load_dword v3, v1, s[4:5]
    global_load_dword v4, v1, s[6:7]
    s_waitcnt vmcnt(1)
    v_add_f32_e32 v2, v2, v3
    s_waitcnt vmcnt(0)
    v_add_f32_e32 v2, v2, v4
    global_store_dword v1, v2, s[8:9]
    s_branch .LBB1_3

snikitav commented 9 months ago

@EugeneZelenko may I take this one?

EugeneZelenko commented 9 months ago

@snikitav: It would be a good idea to ask @arsenm, who is the most active developer of the backend.

arsenm commented 8 months ago

Would it be possible to have intrinsics to have access to the base + offset versions of the load/store instructions?

No, that's a maintenance headache for both the backend and user code. The addressing modes change all the time. We do try to make use of them, but the addressing modes are less useful than you would hope, since we need to prove lack of overflow to reassociate the expressions into the forms they need.
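
For illustration, a minimal hedged sketch of what handing the compiler that no-overflow fact can look like from the source side (the load_hinted and load_byte_offset helpers are hypothetical names; __builtin_assume is a Clang builtin, and whether a given AMDGPU backend release then selects the SGPR-base + 32-bit VGPR-offset addressing is not guaranteed):

__device__ float load_hinted(const float* base, uint32_t i) {
  // Option 1: assert a bound the optimizer may use.
  __builtin_assume(i < (1u << 30));  // i * sizeof(float) fits in 32 bits
  return base[i];
}

__device__ float load_byte_offset(const float* base, uint32_t byte_off) {
  // Option 2: keep the byte offset itself in 32 bits, as in the earlier
  // workaround; the programmer owns the no-overflow assumption.
  return *(const float*)((const char*)base + byte_off);
}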