mratsim / constantine

Constantine: modular, high-performance, zero-dependency cryptography stack for verifiable computation, proof systems and blockchain protocols.
Other
408 stars 43 forks source link

Tracking compiler inefficiencies #357

Open mratsim opened 9 months ago

mratsim commented 9 months ago

As mentioned in https://github.com/mratsim/constantine/blob/661a481/README-PERFORMANCE.md#compiler-caveats, compilers have a hard time optimizing bigint operations, even ones as simple as an addition with carries.

This issue tracks their evolution and the quality of the code generated with compiler builtins for ISAs of interest.

Note that as of February 2024, we use:

2019, GCC 9.2 and Clang 9.0

The original problem: https://gcc.godbolt.org/z/2h768y

image

Even with intrinsics, an operation as simple as addition-with-carry is compiled poorly by GCC. This was already pointed out by the GMP folks 30 years ago: https://gmplib.org/manual/Assembly-Carry-Propagation.html

2024, GCC 13.2 and Clang 17.0

https://gcc.godbolt.org/z/jdecvffaP

image

GCC fixed the x86 intrinsics, but unfortunately the portable intrinsics have terrible codegen and hence make a terrible fallback for ARM.

Current status

Because GCC's codegen for `__builtin_addcll` is abysmal, relying on it is a non-starter. Clang has decent codegen.

Assembly is still very much needed.

This also explains the bad ARM performance on Apple M1, M2, M3, mentioned by @agnxsh (https://github.com/mratsim/constantine/pull/354#issuecomment-1925688995) and @bkomuves

mratsim commented 4 months ago

According to https://stackoverflow.com/questions/33690791/producing-good-add-with-carry-code-from-clang#comment129749304_73472447, `__builtin_addcll` has been useful on Clang since Clang 10 (2020).

Fortunately, Clang 10 was released in March 2020, just prior to Ubuntu LTS 20.04, and it's the default compiler there:

And the Apple M1 and later chips, which have the largest number of ARM users, date from November 2020, so they should also have proper support (unless Apple didn't merge all upstream improvements).

chfast commented 4 months ago

`__builtin_addcll` was added in GCC 14 because of `_BitInt`. Your GCC 13 example compiles only because the C language allows calls to undeclared functions. It's good to enable at least some warnings, e.g. -Wall -Wextra.

Here are examples for GCC 14: https://gcc.godbolt.org/z/jx5E3MKhz. I couldn't make _BitInt(256) work on clang/ARM. To be investigated.

mratsim commented 3 months ago

Some experiments from writing a direct LLVM IR code generator that doesn't go through C:

mratsim commented 3 months ago

Using the following we can get:

| bitwidth | x86_64              | ARM64                |
|----------|---------------------|----------------------|
| i256     | optimal             | 1 extra instruction  |
| i320     | 1 extra instruction | 2 extra instructions |

On AMDGPU I'm unsure: the additions-with-carry don't seem to be chaining properly.

https://alive2.llvm.org/ce/z/qiNxP7

Original IR

; ModuleID = 'x86_poc'
; Only one target triple is active at a time; the arm64 and x86_64 lines are
; kept commented out so the same module can be retargeted by swapping them.
; target triple = "arm64"
; target triple = "x86_64"
target triple = "amdgcn-amd-amdhsa"

; Modulus constants, one per field (i256, i384 and i320 respectively). Each is
; placed in its own named section with 64-byte alignment, and is passed by
; pointer as the last argument of the matching @_modadd_noo.* helper.
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64

; Function Attrs: hot
; Modular addition on i256 values: stores either a + b or a + b - M to %0.
;   %0      destination pointer, receives the i256 result
;   %1, %2  pointers to the i256 operands a and b
;   %3      pointer to the i256 modulus M
; The top bit (bit 255) of a + b - M is the selector: when it is set the
; unreduced sum is stored, otherwise the difference is stored.
; NOTE(review): using the top bit as a borrow flag presumably requires
; a + b < 2^255, i.e. a modulus with a spare top bit - confirm with callers.
; NOTE(review): all accesses claim only 4-byte alignment, although the modulus
; globals in this module are declared align 64 - confirm this is intentional.
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  ; Wide add and subtract; carry propagation across limbs is left to the backend.
  %a_plus_b = add i256 %a, %b
  %5 = sub i256 %a_plus_b, %M
  ; Move bit 255 of the difference down to bit 0 and keep it as an i1 flag.
  %6 = lshr i256 %5, 255
  %7 = trunc i256 %6 to i1
  ; Flag set -> keep a + b; flag clear -> use a + b - M.
  %8 = select i1 %7, i256 %a_plus_b, i256 %5
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Modular addition on i320 values: stores either a + b or a + b - M to %0.
;   %0      destination pointer, receives the i320 result
;   %1, %2  pointers to the i320 operands a and b
;   %3      pointer to the i320 modulus M
; The top bit (bit 319) of a + b - M is the selector: when it is set the
; unreduced sum is stored, otherwise the difference is stored.
; NOTE(review): using the top bit as a borrow flag presumably requires
; a + b < 2^319, i.e. a modulus with a spare top bit - confirm with callers.
; NOTE(review): all accesses claim only 4-byte alignment, although the modulus
; globals in this module are declared align 64 - confirm this is intentional.
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i320, ptr %1, align 4
  %b = load i320, ptr %2, align 4
  %M = load i320, ptr %3, align 4
  ; Wide add and subtract; carry propagation across limbs is left to the backend.
  %a_plus_b = add i320 %a, %b
  %5 = sub i320 %a_plus_b, %M
  ; Move bit 319 of the difference down to bit 0 and keep it as an i1 flag.
  %6 = lshr i320 %5, 319
  %7 = trunc i320 %6 to i1
  ; Flag set -> keep a + b; flag clear -> use a + b - M.
  %8 = select i1 %7, i320 %a_plus_b, i320 %5
  store i320 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Modular addition on i384 values: stores either a + b or a + b - M to %0.
;   %0      destination pointer, receives the i384 result
;   %1, %2  pointers to the i384 operands a and b
;   %3      pointer to the i384 modulus M
; The top bit (bit 383) of a + b - M is the selector: when it is set the
; unreduced sum is stored, otherwise the difference is stored.
; NOTE(review): using the top bit as a borrow flag presumably requires
; a + b < 2^383, i.e. a modulus with a spare top bit - confirm with callers.
; NOTE(review): all accesses claim only 4-byte alignment, although the modulus
; globals in this module are declared align 64 - confirm this is intentional.
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i384, ptr %1, align 4
  %b = load i384, ptr %2, align 4
  %M = load i384, ptr %3, align 4
  ; Wide add and subtract; carry propagation across limbs is left to the backend.
  %a_plus_b = add i384 %a, %b
  %5 = sub i384 %a_plus_b, %M
  ; Move bit 383 of the difference down to bit 0 and keep it as an i1 flag.
  %6 = lshr i384 %5, 383
  %7 = trunc i384 %6 to i1
  ; Flag set -> keep a + b; flag clear -> use a + b - M.
  %8 = select i1 %7, i384 %a_plus_b, i384 %5
  store i384 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Externally visible entry point for the i256 case. Forwards its three pointers
; (%0 destination, %1 and %2 operands) unchanged to @_modadd_noo.u64x4 and
; supplies @bn254_snarks_fp_mod as the modulus pointer.
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
  call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
  ret void
}

; Function Attrs: hot
; Externally visible entry point for the i320 case. Forwards its three pointers
; (%0 destination, %1 and %2 operands) unchanged to @_modadd_noo.u64x5 and
; supplies @bls24_317_fp_mod as the modulus pointer.
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
  call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
  ret void
}

; Function Attrs: hot
; Externally visible entry point for the i384 case. Forwards its three pointers
; (%0 destination, %1 and %2 operands) unchanged to @_modadd_noo.u64x6 and
; supplies @bls12_381_fp_mod as the modulus pointer.
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
  call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
  ret void
}

; Attribute group #2, referenced by every function defined in this module:
; marks them as hot.
attributes #2 = { hot }

after opt -O3

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %a_plus_b.i = add i256 %.val1, %.val
  %4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
  %.not1.i = icmp slt i256 %4, 0
  %5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
  store i256 %5, ptr %0, align 4
  ret void
}

define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
  %.val = load i320, ptr %1, align 4
  %.val1 = load i320, ptr %2, align 4
  %a_plus_b.i = add i320 %.val1, %.val
  %4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
  %.not1.i = icmp slt i320 %4, 0
  %5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
  store i320 %5, ptr %0, align 4
  ret void
}

define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
  %.val = load i384, ptr %1, align 4
  %.val1 = load i384, ptr %2, align 4
  %a_plus_b.i = add i384 %.val1, %.val
  %4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
  %.not1.i = icmp slt i384 %4, 0
  %5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
  store i384 %5, ptr %0, align 4
  ret void
}

attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
bn254_snarks_fp_add:                    ; @bn254_snarks_fp_add
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        flat_load_dword v14, v[2:3]
        v_add_i32_e32 v15, vcc, 28, v2
        v_addc_u32_e32 v16, vcc, 0, v3, vcc
        v_add_i32_e32 v17, vcc, 24, v2
        v_addc_u32_e32 v18, vcc, 0, v3, vcc
        v_add_i32_e32 v19, vcc, 20, v2
        v_addc_u32_e32 v20, vcc, 0, v3, vcc
        v_add_i32_e32 v21, vcc, 16, v2
        v_addc_u32_e32 v22, vcc, 0, v3, vcc
        v_add_i32_e32 v23, vcc, 12, v2
        v_addc_u32_e32 v24, vcc, 0, v3, vcc
        v_add_i32_e32 v25, vcc, 8, v2
        v_addc_u32_e32 v26, vcc, 0, v3, vcc
        v_add_i32_e32 v27, vcc, 4, v2
        v_addc_u32_e32 v28, vcc, 0, v3, vcc
        v_add_i32_e32 v29, vcc, 28, v4
        v_addc_u32_e32 v30, vcc, 0, v5, vcc
        v_add_i32_e32 v31, vcc, 24, v4
        v_addc_u32_e32 v32, vcc, 0, v5, vcc
        v_add_i32_e32 v8, vcc, 20, v4
        v_addc_u32_e32 v9, vcc, 0, v5, vcc
        v_add_i32_e32 v12, vcc, 16, v4
        v_addc_u32_e32 v13, vcc, 0, v5, vcc
        v_add_i32_e32 v2, vcc, 12, v4
        v_addc_u32_e32 v3, vcc, 0, v5, vcc
        v_add_i32_e32 v6, vcc, 8, v4
        v_addc_u32_e32 v7, vcc, 0, v5, vcc
        v_add_i32_e32 v10, vcc, 4, v4
        v_addc_u32_e32 v11, vcc, 0, v5, vcc
        flat_load_dword v5, v[4:5]
        flat_load_dword v4, v[15:16]
        v_mov_b32_e32 v33, 0xc3df73e9
        flat_load_dword v17, v[17:18]
        v_mov_b32_e32 v18, 0x978e3572
        flat_load_dword v19, v[19:20]
        v_mov_b32_e32 v20, 0x687e956e
        flat_load_dword v21, v[21:22]
        v_mov_b32_e32 v22, 0x7e7ea7a2
        flat_load_dword v23, v[23:24]
        v_mov_b32_e32 v24, 0x47afba49
        flat_load_dword v25, v[25:26]
        v_mov_b32_e32 v26, 0x1ece5fd6
        flat_load_dword v27, v[27:28]
        v_mov_b32_e32 v28, 0xcf9bb18d
        flat_load_dword v29, v[29:30]
        flat_load_dword v30, v[31:32]
        v_add_i32_e32 v15, vcc, 4, v0
        v_addc_u32_e32 v16, vcc, 0, v1, vcc
        flat_load_dword v31, v[8:9]
        flat_load_dword v12, v[12:13]
        v_add_i32_e32 v8, vcc, 8, v0
        v_addc_u32_e32 v9, vcc, 0, v1, vcc
        flat_load_dword v13, v[2:3]
        flat_load_dword v7, v[6:7]
        v_add_i32_e32 v2, vcc, 12, v0
        v_addc_u32_e32 v3, vcc, 0, v1, vcc
        flat_load_dword v10, v[10:11]
        s_waitcnt vmcnt(0) lgkmcnt(0)
        v_add_i32_e32 v14, vcc, v5, v14
        v_add_i32_e64 v5, s[4:5], 16, v0
        v_addc_u32_e64 v6, s[4:5], 0, v1, s[4:5]
        v_addc_u32_e32 v27, vcc, v10, v27, vcc
        v_addc_u32_e32 v7, vcc, v7, v25, vcc
        v_add_i32_e64 v10, s[4:5], 20, v0
        v_addc_u32_e64 v11, s[4:5], 0, v1, s[4:5]
        v_addc_u32_e32 v23, vcc, v13, v23, vcc
        v_addc_u32_e32 v21, vcc, v12, v21, vcc
        v_add_i32_e64 v12, s[4:5], 24, v0
        v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5]
        v_addc_u32_e32 v25, vcc, v31, v19, vcc
        v_addc_u32_e32 v30, vcc, v30, v17, vcc
        v_addc_u32_e32 v4, vcc, v29, v4, vcc
        v_add_i32_e32 v29, vcc, 0x278302b9, v14
        v_addc_u32_e32 v31, vcc, v27, v33, vcc
        v_addc_u32_e32 v32, vcc, v7, v18, vcc
        v_addc_u32_e32 v33, vcc, v23, v20, vcc
        v_addc_u32_e32 v22, vcc, v21, v22, vcc
        v_addc_u32_e32 v24, vcc, v25, v24, vcc
        v_addc_u32_e32 v17, vcc, v30, v26, vcc
        v_addc_u32_e32 v18, vcc, v4, v28, vcc
        v_add_i32_e32 v19, vcc, 28, v0
        v_addc_u32_e32 v20, vcc, 0, v1, vcc
        v_cmp_gt_i64_e32 vcc, 0, v[17:18]
        v_cndmask_b32_e32 v26, v31, v27, vcc
        flat_store_dword v[15:16], v26
        v_cndmask_b32_e32 v14, v29, v14, vcc
        v_cndmask_b32_e32 v4, v18, v4, vcc
        v_cndmask_b32_e32 v15, v17, v30, vcc
        v_cndmask_b32_e32 v16, v24, v25, vcc
        v_cndmask_b32_e32 v17, v22, v21, vcc
        v_cndmask_b32_e32 v18, v33, v23, vcc
        v_cndmask_b32_e32 v7, v32, v7, vcc
        flat_store_dword v[8:9], v7
        flat_store_dword v[2:3], v18
        flat_store_dword v[5:6], v17
        flat_store_dword v[10:11], v16
        flat_store_dword v[12:13], v15
        flat_store_dword v[19:20], v4
        flat_store_dword v[0:1], v14
        s_waitcnt vmcnt(0) lgkmcnt(0)
        s_setpc_b64 s[30:31]
bls24_317_fp_add:                       ; @bls24_317_fp_add
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_mov_b32 s6, s33
        s_add_i32 s33, s32, 0xfc0
        s_and_b32 s33, s33, 0xfffff000
        s_addk_i32 s32, 0x5000
        buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
        buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
        flat_load_dword v26, v[2:3]
        v_add_i32_e32 v30, vcc, 36, v2
        v_addc_u32_e32 v31, vcc, 0, v3, vcc
        v_add_i32_e32 v32, vcc, 32, v2
        v_addc_u32_e32 v33, vcc, 0, v3, vcc
        v_add_i32_e32 v34, vcc, 28, v2
        v_addc_u32_e32 v35, vcc, 0, v3, vcc
        v_add_i32_e32 v37, vcc, 24, v2
        v_addc_u32_e32 v38, vcc, 0, v3, vcc
        v_add_i32_e32 v48, vcc, 20, v2
        v_addc_u32_e32 v49, vcc, 0, v3, vcc
        v_add_i32_e32 v50, vcc, 16, v2
        v_addc_u32_e32 v51, vcc, 0, v3, vcc
        v_add_i32_e32 v52, vcc, 12, v2
        v_addc_u32_e32 v53, vcc, 0, v3, vcc
        v_add_i32_e32 v22, vcc, 8, v2
        v_addc_u32_e32 v23, vcc, 0, v3, vcc
        v_add_i32_e32 v24, vcc, 4, v2
        v_addc_u32_e32 v25, vcc, 0, v3, vcc
        v_add_i32_e32 v2, vcc, 36, v4
        v_addc_u32_e32 v3, vcc, 0, v5, vcc
        v_add_i32_e32 v8, vcc, 32, v4
        v_addc_u32_e32 v9, vcc, 0, v5, vcc
        v_add_i32_e32 v6, vcc, 28, v4
        v_addc_u32_e32 v7, vcc, 0, v5, vcc
        v_add_i32_e32 v10, vcc, 24, v4
        v_addc_u32_e32 v11, vcc, 0, v5, vcc
        v_add_i32_e32 v12, vcc, 20, v4
        v_addc_u32_e32 v13, vcc, 0, v5, vcc
        v_add_i32_e32 v14, vcc, 16, v4
        v_addc_u32_e32 v15, vcc, 0, v5, vcc
        v_add_i32_e32 v16, vcc, 12, v4
        v_addc_u32_e32 v17, vcc, 0, v5, vcc
        v_add_i32_e32 v18, vcc, 8, v4
        v_addc_u32_e32 v19, vcc, 0, v5, vcc
        v_add_i32_e32 v20, vcc, 4, v4
        v_addc_u32_e32 v21, vcc, 0, v5, vcc
        flat_load_dword v4, v[4:5]
        v_mov_b32_e32 v5, 0x72aed1a9
        v_mov_b32_e32 v27, 0xcbdb4081
        v_mov_b32_e32 v28, 0x290cc61b
        v_mov_b32_e32 v29, 0x7b38cbb9
        flat_load_dword v31, v[30:31]
        v_mov_b32_e32 v30, 0xe9659e19
        flat_load_dword v33, v[32:33]
        v_mov_b32_e32 v32, 0x48062fc6
        flat_load_dword v36, v[34:35]
        v_mov_b32_e32 v34, 0xd703a5f
        flat_load_dword v38, v[37:38]
        v_mov_b32_e32 v35, 0x909f76d3
        flat_load_dword v39, v[48:49]
        v_mov_b32_e32 v37, 0xefa735dd
        flat_load_dword v50, v[50:51]
        flat_load_dword v51, v[52:53]
        v_add_i32_e32 v48, vcc, 4, v0
        v_addc_u32_e32 v49, vcc, 0, v1, vcc
        flat_load_dword v52, v[22:23]
        flat_load_dword v24, v[24:25]
        v_add_i32_e32 v22, vcc, 8, v0
        v_addc_u32_e32 v23, vcc, 0, v1, vcc
        flat_load_dword v25, v[2:3]
        flat_load_dword v53, v[8:9]
        v_add_i32_e32 v2, vcc, 12, v0
        v_addc_u32_e32 v3, vcc, 0, v1, vcc
        flat_load_dword v54, v[6:7]
        flat_load_dword v55, v[10:11]
        v_add_i32_e32 v6, vcc, 16, v0
        v_addc_u32_e32 v7, vcc, 0, v1, vcc
        flat_load_dword v40, v[12:13]
        flat_load_dword v41, v[14:15]
        v_add_i32_e32 v8, vcc, 20, v0
        v_addc_u32_e32 v9, vcc, 0, v1, vcc
        flat_load_dword v16, v[16:17]
        flat_load_dword v14, v[18:19]
        v_add_i32_e32 v10, vcc, 24, v0
        v_addc_u32_e32 v11, vcc, 0, v1, vcc
        flat_load_dword v15, v[20:21]
        s_waitcnt vmcnt(0) lgkmcnt(0)
        v_add_i32_e32 v18, vcc, v4, v26
        v_add_i32_e64 v12, s[4:5], 28, v0
        v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5]
        v_addc_u32_e32 v19, vcc, v15, v24, vcc
        v_addc_u32_e32 v20, vcc, v14, v52, vcc
        v_add_i32_e64 v14, s[4:5], 32, v0
        v_addc_u32_e64 v15, s[4:5], 0, v1, s[4:5]
        v_addc_u32_e32 v21, vcc, v16, v51, vcc
        v_addc_u32_e32 v24, vcc, v41, v50, vcc
        v_addc_u32_e32 v26, vcc, v40, v39, vcc
        v_addc_u32_e32 v38, vcc, v55, v38, vcc
        v_addc_u32_e32 v36, vcc, v54, v36, vcc
        v_addc_u32_e32 v33, vcc, v53, v33, vcc
        v_addc_u32_e32 v25, vcc, v25, v31, vcc
        v_add_i32_e32 v31, vcc, 0xa254d555, v18
        v_addc_u32_e32 v39, vcc, v19, v5, vcc
        v_addc_u32_e32 v27, vcc, v20, v27, vcc
        v_addc_u32_e32 v28, vcc, v21, v28, vcc
        v_addc_u32_e32 v29, vcc, v24, v29, vcc
        v_addc_u32_e32 v30, vcc, v26, v30, vcc
        v_addc_u32_e32 v32, vcc, v38, v32, vcc
        v_addc_u32_e32 v34, vcc, v36, v34, vcc
        v_addc_u32_e32 v35, vcc, v33, v35, vcc
        v_addc_u32_e32 v37, vcc, v25, v37, vcc
        v_ashrrev_i32_e32 v4, 31, v37
        v_add_i32_e32 v16, vcc, 36, v0
        v_addc_u32_e32 v17, vcc, 0, v1, vcc
        v_mov_b32_e32 v5, v4
        v_cmp_gt_i64_e32 vcc, 0, v[4:5]
        v_cndmask_b32_e32 v4, v39, v19, vcc
        flat_store_dword v[48:49], v4
        v_cndmask_b32_e32 v4, v31, v18, vcc
        v_cndmask_b32_e32 v5, v37, v25, vcc
        v_cndmask_b32_e32 v18, v35, v33, vcc
        v_cndmask_b32_e32 v19, v34, v36, vcc
        v_cndmask_b32_e32 v25, v32, v38, vcc
        v_cndmask_b32_e32 v26, v30, v26, vcc
        v_cndmask_b32_e32 v24, v29, v24, vcc
        v_cndmask_b32_e32 v21, v28, v21, vcc
        v_cndmask_b32_e32 v20, v27, v20, vcc
        flat_store_dword v[22:23], v20
        flat_store_dword v[2:3], v21
        flat_store_dword v[6:7], v24
        flat_store_dword v[8:9], v26
        flat_store_dword v[10:11], v25
        flat_store_dword v[12:13], v19
        flat_store_dword v[14:15], v18
        flat_store_dword v[16:17], v5
        flat_store_dword v[0:1], v4
        buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
        buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
        s_addk_i32 s32, 0xb000
        s_mov_b32 s33, s6
        s_waitcnt vmcnt(0) lgkmcnt(0)
        s_setpc_b64 s[30:31]
bls12_381_fp_add:                       ; @bls12_381_fp_add
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_mov_b32 s4, s33
        s_add_i32 s33, s32, 0xfc0
        s_and_b32 s33, s33, 0xfffff000
        s_addk_i32 s32, 0x5000
        buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
        buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
        buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
        buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
        buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
        buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
        buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
        buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
        buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
        buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
        buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
        buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
        buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
        buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
        buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
        buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
        flat_load_dword v40, v[2:3]
        v_add_i32_e32 v54, vcc, 44, v2
        v_addc_u32_e32 v55, vcc, 0, v3, vcc
        v_add_i32_e32 v6, vcc, 40, v2
        v_addc_u32_e32 v7, vcc, 0, v3, vcc
        v_add_i32_e32 v16, vcc, 36, v2
        v_addc_u32_e32 v17, vcc, 0, v3, vcc
        v_add_i32_e32 v8, vcc, 32, v2
        v_addc_u32_e32 v9, vcc, 0, v3, vcc
        v_add_i32_e32 v18, vcc, 28, v2
        v_addc_u32_e32 v19, vcc, 0, v3, vcc
        v_add_i32_e32 v10, vcc, 24, v2
        v_addc_u32_e32 v11, vcc, 0, v3, vcc
        v_add_i32_e32 v20, vcc, 20, v2
        v_addc_u32_e32 v21, vcc, 0, v3, vcc
        v_add_i32_e32 v12, vcc, 16, v2
        v_addc_u32_e32 v13, vcc, 0, v3, vcc
        v_add_i32_e32 v44, vcc, 12, v2
        v_addc_u32_e32 v45, vcc, 0, v3, vcc
        v_add_i32_e32 v14, vcc, 8, v2
        v_addc_u32_e32 v15, vcc, 0, v3, vcc
        v_add_i32_e32 v24, vcc, 4, v2
        v_addc_u32_e32 v25, vcc, 0, v3, vcc
        v_add_i32_e32 v2, vcc, 44, v4
        v_addc_u32_e32 v3, vcc, 0, v5, vcc
        v_add_i32_e32 v32, vcc, 40, v4
        v_addc_u32_e32 v33, vcc, 0, v5, vcc
        v_add_i32_e32 v28, vcc, 36, v4
        v_addc_u32_e32 v29, vcc, 0, v5, vcc
        v_add_i32_e32 v34, vcc, 32, v4
        v_addc_u32_e32 v35, vcc, 0, v5, vcc
        v_add_i32_e32 v30, vcc, 28, v4
        v_addc_u32_e32 v31, vcc, 0, v5, vcc
        v_add_i32_e32 v26, vcc, 24, v4
        v_addc_u32_e32 v27, vcc, 0, v5, vcc
        v_add_i32_e32 v36, vcc, 20, v4
        v_addc_u32_e32 v37, vcc, 0, v5, vcc
        v_add_i32_e32 v38, vcc, 16, v4
        v_addc_u32_e32 v39, vcc, 0, v5, vcc
        v_add_i32_e32 v48, vcc, 12, v4
        v_addc_u32_e32 v49, vcc, 0, v5, vcc
        v_add_i32_e32 v50, vcc, 8, v4
        v_addc_u32_e32 v51, vcc, 0, v5, vcc
        v_add_i32_e32 v52, vcc, 4, v4
        v_addc_u32_e32 v53, vcc, 0, v5, vcc
        flat_load_dword v41, v[4:5]
        v_mov_b32_e32 v23, 0xe1540001
        v_mov_b32_e32 v46, 0x98cf2d5f
        v_mov_b32_e32 v47, 0xc7aed40
        v_mov_b32_e32 v56, 0x9b88b47b
        v_mov_b32_e32 v57, 0xbcb45328
        v_mov_b32_e32 v58, 0xb4e45849
        v_mov_b32_e32 v59, 0xc6801965
        v_mov_b32_e32 v60, 0xe5feee15
        flat_load_dword v54, v[54:55]
        v_add_i32_e32 v42, vcc, 4, v0
        v_addc_u32_e32 v43, vcc, 0, v1, vcc
        flat_load_dword v55, v[6:7]
        flat_load_dword v61, v[16:17]
        v_add_i32_e32 v6, vcc, 8, v0
        v_addc_u32_e32 v7, vcc, 0, v1, vcc
        flat_load_dword v62, v[8:9]
        flat_load_dword v63, v[18:19]
        v_add_i32_e32 v8, vcc, 12, v0
        v_addc_u32_e32 v9, vcc, 0, v1, vcc
        flat_load_dword v22, v[10:11]
        flat_load_dword v5, v[20:21]
        v_add_i32_e32 v10, vcc, 16, v0
        v_addc_u32_e32 v11, vcc, 0, v1, vcc
        flat_load_dword v4, v[12:13]
        flat_load_dword v44, v[44:45]
        v_add_i32_e32 v12, vcc, 20, v0
        v_addc_u32_e32 v13, vcc, 0, v1, vcc
        flat_load_dword v45, v[14:15]
        flat_load_dword v24, v[24:25]
        v_add_i32_e32 v14, vcc, 24, v0
        v_addc_u32_e32 v15, vcc, 0, v1, vcc
        flat_load_dword v25, v[2:3]
        flat_load_dword v32, v[32:33]
        v_add_i32_e32 v2, vcc, 28, v0
        v_addc_u32_e32 v3, vcc, 0, v1, vcc
        flat_load_dword v28, v[28:29]
        flat_load_dword v29, v[34:35]
        v_add_i32_e32 v16, vcc, 32, v0
        v_addc_u32_e32 v17, vcc, 0, v1, vcc
        flat_load_dword v30, v[30:31]
        flat_load_dword v31, v[48:49]
        v_add_i32_e32 v18, vcc, 36, v0
        v_addc_u32_e32 v19, vcc, 0, v1, vcc
        flat_load_dword v33, v[52:53]
        flat_load_dword v34, v[50:51]
        v_add_i32_e32 v20, vcc, 40, v0
        v_addc_u32_e32 v21, vcc, 0, v1, vcc
        flat_load_dword v26, v[26:27]
        flat_load_dword v27, v[36:37]
        flat_load_dword v35, v[38:39]
        s_waitcnt vmcnt(0) lgkmcnt(0)
        v_add_i32_e32 v36, vcc, v41, v40
        v_addc_u32_e32 v33, vcc, v33, v24, vcc
        v_addc_u32_e32 v34, vcc, v34, v45, vcc
        v_addc_u32_e32 v31, vcc, v31, v44, vcc
        v_addc_u32_e32 v4, vcc, v35, v4, vcc
        v_addc_u32_e32 v5, vcc, v27, v5, vcc
        v_addc_u32_e32 v26, vcc, v26, v22, vcc
        v_addc_u32_e32 v27, vcc, v30, v63, vcc
        v_addc_u32_e32 v29, vcc, v29, v62, vcc
        v_addc_u32_e32 v28, vcc, v28, v61, vcc
        v_addc_u32_e32 v30, vcc, v32, v55, vcc
        v_addc_u32_e32 v32, vcc, v25, v54, vcc
        v_add_i32_e32 v35, vcc, 0x5555, v36
        v_mov_b32_e32 v22, 0x46010000
        v_addc_u32_e32 v37, vcc, v33, v22, vcc
        v_mov_b32_e32 v22, 0x4eac0000
        v_addc_u32_e32 v38, vcc, v34, v22, vcc
        v_addc_u32_e32 v39, vcc, v31, v23, vcc
        v_mov_b32_e32 v22, 0x94f09db
        v_addc_u32_e32 v48, vcc, v4, v22, vcc
        v_addc_u32_e32 v49, vcc, v5, v46, vcc
        v_addc_u32_e32 v50, vcc, v26, v47, vcc
        v_addc_u32_e32 v51, vcc, v27, v56, vcc
        v_addc_u32_e32 v52, vcc, v29, v57, vcc
        v_addc_u32_e32 v53, vcc, v28, v58, vcc
        v_addc_u32_e32 v54, vcc, v30, v59, vcc
        v_addc_u32_e32 v55, vcc, v32, v60, vcc
        v_ashrrev_i32_e32 v22, 31, v55
        v_add_i32_e32 v24, vcc, 44, v0
        v_addc_u32_e32 v25, vcc, 0, v1, vcc
        v_mov_b32_e32 v23, v22
        v_cmp_gt_i64_e32 vcc, 0, v[22:23]
        v_cndmask_b32_e32 v22, v35, v36, vcc
        v_cndmask_b32_e32 v23, v37, v33, vcc
        v_cndmask_b32_e32 v33, v38, v34, vcc
        v_cndmask_b32_e32 v31, v39, v31, vcc
        v_cndmask_b32_e32 v32, v55, v32, vcc
        v_cndmask_b32_e32 v30, v54, v30, vcc
        v_cndmask_b32_e32 v28, v53, v28, vcc
        v_cndmask_b32_e32 v29, v52, v29, vcc
        v_cndmask_b32_e32 v27, v51, v27, vcc
        v_cndmask_b32_e32 v26, v50, v26, vcc
        v_cndmask_b32_e32 v5, v49, v5, vcc
        v_cndmask_b32_e32 v4, v48, v4, vcc
        flat_store_dword v[42:43], v23
        flat_store_dword v[6:7], v33
        flat_store_dword v[8:9], v31
        flat_store_dword v[10:11], v4
        flat_store_dword v[12:13], v5
        flat_store_dword v[14:15], v26
        flat_store_dword v[2:3], v27
        flat_store_dword v[16:17], v29
        flat_store_dword v[18:19], v28
        flat_store_dword v[20:21], v30
        flat_store_dword v[24:25], v32
        flat_store_dword v[0:1], v22
        buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
        buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
        buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
        buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
        buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
        buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
        buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
        buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
        buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
        buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
        buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
        buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
        buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
        buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
        buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
        buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
        s_addk_i32 s32, 0xb000
        s_mov_b32 s33, s4
        s_waitcnt vmcnt(0) lgkmcnt(0)
        s_setpc_b64 s[30:31]
---
amdhsa.target:   amdgcn-amd-amdhsa--gfx700
  - 1
  - 2
mratsim commented 3 months ago

When using the full 256 bits like with secp256k1, we must pay a lot of attention to the inliner: