Open mratsim opened 9 months ago
According to https://stackoverflow.com/questions/33690791/producing-good-add-with-carry-code-from-clang#comment129749304_73472447, __builtin_addcll
has been usable on Clang since Clang 10, released in 2020.
Fortunately, Clang 10 was released in March 2020, just prior to Ubuntu LTS 20.04, and it's the default compiler there:
And Apple M1 or later — which accounts for the largest share of ARM users — dates from November 2020, so it should also have proper support (unless Apple didn't merge all upstream improvements).
The __builtin_addcll
has been added in GCC 14 because of the _BitInt
. Your example with GCC 13 compiles only because the C language allows implicitly declared (undefined) functions. It's good to enable at least some warnings: -Wall -Wextra
.
Here are examples for GCC 14: https://gcc.godbolt.org/z/jx5E3MKhz.
I couldn't make _BitInt(256)
work on clang/ARM. To be investigated.
Some experiments from writing a direct LLVM IR codegenerator that doesn't go through C:
Using the following we can get:
bitwidth | x86_64 | ARM64 |
---|---|---|
i256 | optimal | 1 extra instruction |
i320 | 1 extra instruction | 2 extra instructions |
Now on AMDGPU I'm unsure: it doesn't seem like the additions-with-carry are chaining properly.
https://alive2.llvm.org/ce/z/qiNxP7
; ModuleID = 'x86_poc'
; Alternate targets for the same experiment are kept below as comments;
; the active build targets AMDGPU (amdgcn-amd-amdhsa).
; target triple = "arm64"
; target triple = "x86_64"
target triple = "amdgcn-amd-amdhsa"
; Field moduli as wide-integer constants, one per curve. Each lives in a
; per-field "ctt.<field>.constants" section; align 64 presumably chosen for
; cache-line alignment -- TODO confirm intent.
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: hot
; Modular addition, no overflow handling: *%0 = (a + b) mod M for i256 (4 x u64 limbs).
; Assumes (not checked) a < M and b < M with bit 255 of M clear -- true for the
; bn254 modulus above -- so a+b cannot wrap i256, and bit 255 of (a+b)-M is set
; exactly when the subtraction went negative (i.e. a+b was already reduced).
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i256, ptr %1, align 4 ; a = *%1
%b = load i256, ptr %2, align 4 ; b = *%2
%M = load i256, ptr %3, align 4 ; M = *%3, the field modulus
%a_plus_b = add i256 %a, %b
%5 = sub i256 %a_plus_b, %M ; candidate reduced value: (a+b) - M
%6 = lshr i256 %5, 255 ; isolate the top (borrow/sign) bit
%7 = trunc i256 %6 to i1 ; 1 => subtraction underflowed => a+b < M
%8 = select i1 %7, i256 %a_plus_b, i256 %5 ; branchless pick: a+b if already < M, else a+b-M
store i256 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
; Modular addition, no overflow handling: *%0 = (a + b) mod M for i320 (5 x u64 limbs).
; Same scheme as the u64x4 variant: assumes a < M, b < M and bit 319 of M clear
; (true for the bls24_317 modulus above), so bit 319 of (a+b)-M flags underflow.
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i320, ptr %1, align 4 ; a = *%1
%b = load i320, ptr %2, align 4 ; b = *%2
%M = load i320, ptr %3, align 4 ; M = *%3, the field modulus
%a_plus_b = add i320 %a, %b
%5 = sub i320 %a_plus_b, %M ; candidate reduced value: (a+b) - M
%6 = lshr i320 %5, 319 ; isolate the top (borrow/sign) bit
%7 = trunc i320 %6 to i1 ; 1 => subtraction underflowed => a+b < M
%8 = select i1 %7, i320 %a_plus_b, i320 %5 ; branchless pick of the reduced result
store i320 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
; Modular addition, no overflow handling: *%0 = (a + b) mod M for i384 (6 x u64 limbs).
; Same scheme as the u64x4 variant: assumes a < M, b < M and bit 383 of M clear
; (true for the bls12_381 modulus above), so bit 383 of (a+b)-M flags underflow.
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i384, ptr %1, align 4 ; a = *%1
%b = load i384, ptr %2, align 4 ; b = *%2
%M = load i384, ptr %3, align 4 ; M = *%3, the field modulus
%a_plus_b = add i384 %a, %b
%5 = sub i384 %a_plus_b, %M ; candidate reduced value: (a+b) - M
%6 = lshr i384 %5, 383 ; isolate the top (borrow/sign) bit
%7 = trunc i384 %6 to i1 ; 1 => subtraction underflowed => a+b < M
%8 = select i1 %7, i384 %a_plus_b, i384 %5 ; branchless pick of the reduced result
store i384 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
; Public entry point: Fp addition for BN254-Snarks.
; Thin wrapper delegating to the generic 4-limb modular add with the
; bn254 modulus constant baked in as the fourth argument.
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
ret void
}
; Function Attrs: hot
; Public entry point: Fp addition for BLS24-317.
; Thin wrapper delegating to the generic 5-limb modular add with the
; bls24_317 modulus constant baked in as the fourth argument.
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
ret void
}
; Function Attrs: hot
; Public entry point: Fp addition for BLS12-381.
; Thin wrapper delegating to the generic 6-limb modular add with the
; bls12_381 modulus constant baked in as the fourth argument.
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
ret void
}
attributes #2 = { hot }
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Post-optimization form (optimizer output, quoted for comparison): the wrapper
; has been inlined and the sub/lshr/trunc pattern folded -- the modulus subtraction
; became an add of the negated modulus, and the top-bit test became `icmp slt 0`.
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
%a_plus_b.i = add i256 %.val1, %.val
%4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583 ; (a+b) - M
%.not1.i = icmp slt i256 %4, 0 ; underflow <=> a+b < M
%5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
store i256 %5, ptr %0, align 4
ret void
}
; Post-optimization form (optimizer output, quoted for comparison); same folding
; as the bn254 variant, at i320 with the bls24_317 modulus.
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
%a_plus_b.i = add i320 %.val1, %.val
%4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051 ; (a+b) - M
%.not1.i = icmp slt i320 %4, 0 ; underflow <=> a+b < M
%5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
store i320 %5, ptr %0, align 4
ret void
}
; Post-optimization form (optimizer output, quoted for comparison); same folding
; as the bn254 variant, at i384 with the bls12_381 modulus.
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
%a_plus_b.i = add i384 %.val1, %.val
%4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787 ; (a+b) - M
%.not1.i = icmp slt i384 %4, 0 ; underflow <=> a+b < M
%5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
store i384 %5, ptr %0, align 4
ret void
}
attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
bn254_snarks_fp_add: ; @bn254_snarks_fp_add
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
flat_load_dword v14, v[2:3]
v_add_i32_e32 v15, vcc, 28, v2
v_addc_u32_e32 v16, vcc, 0, v3, vcc
v_add_i32_e32 v17, vcc, 24, v2
v_addc_u32_e32 v18, vcc, 0, v3, vcc
v_add_i32_e32 v19, vcc, 20, v2
v_addc_u32_e32 v20, vcc, 0, v3, vcc
v_add_i32_e32 v21, vcc, 16, v2
v_addc_u32_e32 v22, vcc, 0, v3, vcc
v_add_i32_e32 v23, vcc, 12, v2
v_addc_u32_e32 v24, vcc, 0, v3, vcc
v_add_i32_e32 v25, vcc, 8, v2
v_addc_u32_e32 v26, vcc, 0, v3, vcc
v_add_i32_e32 v27, vcc, 4, v2
v_addc_u32_e32 v28, vcc, 0, v3, vcc
v_add_i32_e32 v29, vcc, 28, v4
v_addc_u32_e32 v30, vcc, 0, v5, vcc
v_add_i32_e32 v31, vcc, 24, v4
v_addc_u32_e32 v32, vcc, 0, v5, vcc
v_add_i32_e32 v8, vcc, 20, v4
v_addc_u32_e32 v9, vcc, 0, v5, vcc
v_add_i32_e32 v12, vcc, 16, v4
v_addc_u32_e32 v13, vcc, 0, v5, vcc
v_add_i32_e32 v2, vcc, 12, v4
v_addc_u32_e32 v3, vcc, 0, v5, vcc
v_add_i32_e32 v6, vcc, 8, v4
v_addc_u32_e32 v7, vcc, 0, v5, vcc
v_add_i32_e32 v10, vcc, 4, v4
v_addc_u32_e32 v11, vcc, 0, v5, vcc
flat_load_dword v5, v[4:5]
flat_load_dword v4, v[15:16]
v_mov_b32_e32 v33, 0xc3df73e9
flat_load_dword v17, v[17:18]
v_mov_b32_e32 v18, 0x978e3572
flat_load_dword v19, v[19:20]
v_mov_b32_e32 v20, 0x687e956e
flat_load_dword v21, v[21:22]
v_mov_b32_e32 v22, 0x7e7ea7a2
flat_load_dword v23, v[23:24]
v_mov_b32_e32 v24, 0x47afba49
flat_load_dword v25, v[25:26]
v_mov_b32_e32 v26, 0x1ece5fd6
flat_load_dword v27, v[27:28]
v_mov_b32_e32 v28, 0xcf9bb18d
flat_load_dword v29, v[29:30]
flat_load_dword v30, v[31:32]
v_add_i32_e32 v15, vcc, 4, v0
v_addc_u32_e32 v16, vcc, 0, v1, vcc
flat_load_dword v31, v[8:9]
flat_load_dword v12, v[12:13]
v_add_i32_e32 v8, vcc, 8, v0
v_addc_u32_e32 v9, vcc, 0, v1, vcc
flat_load_dword v13, v[2:3]
flat_load_dword v7, v[6:7]
v_add_i32_e32 v2, vcc, 12, v0
v_addc_u32_e32 v3, vcc, 0, v1, vcc
flat_load_dword v10, v[10:11]
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add_i32_e32 v14, vcc, v5, v14
v_add_i32_e64 v5, s[4:5], 16, v0
v_addc_u32_e64 v6, s[4:5], 0, v1, s[4:5]
v_addc_u32_e32 v27, vcc, v10, v27, vcc
v_addc_u32_e32 v7, vcc, v7, v25, vcc
v_add_i32_e64 v10, s[4:5], 20, v0
v_addc_u32_e64 v11, s[4:5], 0, v1, s[4:5]
v_addc_u32_e32 v23, vcc, v13, v23, vcc
v_addc_u32_e32 v21, vcc, v12, v21, vcc
v_add_i32_e64 v12, s[4:5], 24, v0
v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5]
v_addc_u32_e32 v25, vcc, v31, v19, vcc
v_addc_u32_e32 v30, vcc, v30, v17, vcc
v_addc_u32_e32 v4, vcc, v29, v4, vcc
v_add_i32_e32 v29, vcc, 0x278302b9, v14
v_addc_u32_e32 v31, vcc, v27, v33, vcc
v_addc_u32_e32 v32, vcc, v7, v18, vcc
v_addc_u32_e32 v33, vcc, v23, v20, vcc
v_addc_u32_e32 v22, vcc, v21, v22, vcc
v_addc_u32_e32 v24, vcc, v25, v24, vcc
v_addc_u32_e32 v17, vcc, v30, v26, vcc
v_addc_u32_e32 v18, vcc, v4, v28, vcc
v_add_i32_e32 v19, vcc, 28, v0
v_addc_u32_e32 v20, vcc, 0, v1, vcc
v_cmp_gt_i64_e32 vcc, 0, v[17:18]
v_cndmask_b32_e32 v26, v31, v27, vcc
flat_store_dword v[15:16], v26
v_cndmask_b32_e32 v14, v29, v14, vcc
v_cndmask_b32_e32 v4, v18, v4, vcc
v_cndmask_b32_e32 v15, v17, v30, vcc
v_cndmask_b32_e32 v16, v24, v25, vcc
v_cndmask_b32_e32 v17, v22, v21, vcc
v_cndmask_b32_e32 v18, v33, v23, vcc
v_cndmask_b32_e32 v7, v32, v7, vcc
flat_store_dword v[8:9], v7
flat_store_dword v[2:3], v18
flat_store_dword v[5:6], v17
flat_store_dword v[10:11], v16
flat_store_dword v[12:13], v15
flat_store_dword v[19:20], v4
flat_store_dword v[0:1], v14
s_waitcnt vmcnt(0) lgkmcnt(0)
s_setpc_b64 s[30:31]
bls24_317_fp_add: ; @bls24_317_fp_add
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
s_mov_b32 s6, s33
s_add_i32 s33, s32, 0xfc0
s_and_b32 s33, s33, 0xfffff000
s_addk_i32 s32, 0x5000
buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
flat_load_dword v26, v[2:3]
v_add_i32_e32 v30, vcc, 36, v2
v_addc_u32_e32 v31, vcc, 0, v3, vcc
v_add_i32_e32 v32, vcc, 32, v2
v_addc_u32_e32 v33, vcc, 0, v3, vcc
v_add_i32_e32 v34, vcc, 28, v2
v_addc_u32_e32 v35, vcc, 0, v3, vcc
v_add_i32_e32 v37, vcc, 24, v2
v_addc_u32_e32 v38, vcc, 0, v3, vcc
v_add_i32_e32 v48, vcc, 20, v2
v_addc_u32_e32 v49, vcc, 0, v3, vcc
v_add_i32_e32 v50, vcc, 16, v2
v_addc_u32_e32 v51, vcc, 0, v3, vcc
v_add_i32_e32 v52, vcc, 12, v2
v_addc_u32_e32 v53, vcc, 0, v3, vcc
v_add_i32_e32 v22, vcc, 8, v2
v_addc_u32_e32 v23, vcc, 0, v3, vcc
v_add_i32_e32 v24, vcc, 4, v2
v_addc_u32_e32 v25, vcc, 0, v3, vcc
v_add_i32_e32 v2, vcc, 36, v4
v_addc_u32_e32 v3, vcc, 0, v5, vcc
v_add_i32_e32 v8, vcc, 32, v4
v_addc_u32_e32 v9, vcc, 0, v5, vcc
v_add_i32_e32 v6, vcc, 28, v4
v_addc_u32_e32 v7, vcc, 0, v5, vcc
v_add_i32_e32 v10, vcc, 24, v4
v_addc_u32_e32 v11, vcc, 0, v5, vcc
v_add_i32_e32 v12, vcc, 20, v4
v_addc_u32_e32 v13, vcc, 0, v5, vcc
v_add_i32_e32 v14, vcc, 16, v4
v_addc_u32_e32 v15, vcc, 0, v5, vcc
v_add_i32_e32 v16, vcc, 12, v4
v_addc_u32_e32 v17, vcc, 0, v5, vcc
v_add_i32_e32 v18, vcc, 8, v4
v_addc_u32_e32 v19, vcc, 0, v5, vcc
v_add_i32_e32 v20, vcc, 4, v4
v_addc_u32_e32 v21, vcc, 0, v5, vcc
flat_load_dword v4, v[4:5]
v_mov_b32_e32 v5, 0x72aed1a9
v_mov_b32_e32 v27, 0xcbdb4081
v_mov_b32_e32 v28, 0x290cc61b
v_mov_b32_e32 v29, 0x7b38cbb9
flat_load_dword v31, v[30:31]
v_mov_b32_e32 v30, 0xe9659e19
flat_load_dword v33, v[32:33]
v_mov_b32_e32 v32, 0x48062fc6
flat_load_dword v36, v[34:35]
v_mov_b32_e32 v34, 0xd703a5f
flat_load_dword v38, v[37:38]
v_mov_b32_e32 v35, 0x909f76d3
flat_load_dword v39, v[48:49]
v_mov_b32_e32 v37, 0xefa735dd
flat_load_dword v50, v[50:51]
flat_load_dword v51, v[52:53]
v_add_i32_e32 v48, vcc, 4, v0
v_addc_u32_e32 v49, vcc, 0, v1, vcc
flat_load_dword v52, v[22:23]
flat_load_dword v24, v[24:25]
v_add_i32_e32 v22, vcc, 8, v0
v_addc_u32_e32 v23, vcc, 0, v1, vcc
flat_load_dword v25, v[2:3]
flat_load_dword v53, v[8:9]
v_add_i32_e32 v2, vcc, 12, v0
v_addc_u32_e32 v3, vcc, 0, v1, vcc
flat_load_dword v54, v[6:7]
flat_load_dword v55, v[10:11]
v_add_i32_e32 v6, vcc, 16, v0
v_addc_u32_e32 v7, vcc, 0, v1, vcc
flat_load_dword v40, v[12:13]
flat_load_dword v41, v[14:15]
v_add_i32_e32 v8, vcc, 20, v0
v_addc_u32_e32 v9, vcc, 0, v1, vcc
flat_load_dword v16, v[16:17]
flat_load_dword v14, v[18:19]
v_add_i32_e32 v10, vcc, 24, v0
v_addc_u32_e32 v11, vcc, 0, v1, vcc
flat_load_dword v15, v[20:21]
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add_i32_e32 v18, vcc, v4, v26
v_add_i32_e64 v12, s[4:5], 28, v0
v_addc_u32_e64 v13, s[4:5], 0, v1, s[4:5]
v_addc_u32_e32 v19, vcc, v15, v24, vcc
v_addc_u32_e32 v20, vcc, v14, v52, vcc
v_add_i32_e64 v14, s[4:5], 32, v0
v_addc_u32_e64 v15, s[4:5], 0, v1, s[4:5]
v_addc_u32_e32 v21, vcc, v16, v51, vcc
v_addc_u32_e32 v24, vcc, v41, v50, vcc
v_addc_u32_e32 v26, vcc, v40, v39, vcc
v_addc_u32_e32 v38, vcc, v55, v38, vcc
v_addc_u32_e32 v36, vcc, v54, v36, vcc
v_addc_u32_e32 v33, vcc, v53, v33, vcc
v_addc_u32_e32 v25, vcc, v25, v31, vcc
v_add_i32_e32 v31, vcc, 0xa254d555, v18
v_addc_u32_e32 v39, vcc, v19, v5, vcc
v_addc_u32_e32 v27, vcc, v20, v27, vcc
v_addc_u32_e32 v28, vcc, v21, v28, vcc
v_addc_u32_e32 v29, vcc, v24, v29, vcc
v_addc_u32_e32 v30, vcc, v26, v30, vcc
v_addc_u32_e32 v32, vcc, v38, v32, vcc
v_addc_u32_e32 v34, vcc, v36, v34, vcc
v_addc_u32_e32 v35, vcc, v33, v35, vcc
v_addc_u32_e32 v37, vcc, v25, v37, vcc
v_ashrrev_i32_e32 v4, 31, v37
v_add_i32_e32 v16, vcc, 36, v0
v_addc_u32_e32 v17, vcc, 0, v1, vcc
v_mov_b32_e32 v5, v4
v_cmp_gt_i64_e32 vcc, 0, v[4:5]
v_cndmask_b32_e32 v4, v39, v19, vcc
flat_store_dword v[48:49], v4
v_cndmask_b32_e32 v4, v31, v18, vcc
v_cndmask_b32_e32 v5, v37, v25, vcc
v_cndmask_b32_e32 v18, v35, v33, vcc
v_cndmask_b32_e32 v19, v34, v36, vcc
v_cndmask_b32_e32 v25, v32, v38, vcc
v_cndmask_b32_e32 v26, v30, v26, vcc
v_cndmask_b32_e32 v24, v29, v24, vcc
v_cndmask_b32_e32 v21, v28, v21, vcc
v_cndmask_b32_e32 v20, v27, v20, vcc
flat_store_dword v[22:23], v20
flat_store_dword v[2:3], v21
flat_store_dword v[6:7], v24
flat_store_dword v[8:9], v26
flat_store_dword v[10:11], v25
flat_store_dword v[12:13], v19
flat_store_dword v[14:15], v18
flat_store_dword v[16:17], v5
flat_store_dword v[0:1], v4
buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
s_addk_i32 s32, 0xb000
s_mov_b32 s33, s6
s_waitcnt vmcnt(0) lgkmcnt(0)
s_setpc_b64 s[30:31]
bls12_381_fp_add: ; @bls12_381_fp_add
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
s_mov_b32 s4, s33
s_add_i32 s33, s32, 0xfc0
s_and_b32 s33, s33, 0xfffff000
s_addk_i32 s32, 0x5000
buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
flat_load_dword v40, v[2:3]
v_add_i32_e32 v54, vcc, 44, v2
v_addc_u32_e32 v55, vcc, 0, v3, vcc
v_add_i32_e32 v6, vcc, 40, v2
v_addc_u32_e32 v7, vcc, 0, v3, vcc
v_add_i32_e32 v16, vcc, 36, v2
v_addc_u32_e32 v17, vcc, 0, v3, vcc
v_add_i32_e32 v8, vcc, 32, v2
v_addc_u32_e32 v9, vcc, 0, v3, vcc
v_add_i32_e32 v18, vcc, 28, v2
v_addc_u32_e32 v19, vcc, 0, v3, vcc
v_add_i32_e32 v10, vcc, 24, v2
v_addc_u32_e32 v11, vcc, 0, v3, vcc
v_add_i32_e32 v20, vcc, 20, v2
v_addc_u32_e32 v21, vcc, 0, v3, vcc
v_add_i32_e32 v12, vcc, 16, v2
v_addc_u32_e32 v13, vcc, 0, v3, vcc
v_add_i32_e32 v44, vcc, 12, v2
v_addc_u32_e32 v45, vcc, 0, v3, vcc
v_add_i32_e32 v14, vcc, 8, v2
v_addc_u32_e32 v15, vcc, 0, v3, vcc
v_add_i32_e32 v24, vcc, 4, v2
v_addc_u32_e32 v25, vcc, 0, v3, vcc
v_add_i32_e32 v2, vcc, 44, v4
v_addc_u32_e32 v3, vcc, 0, v5, vcc
v_add_i32_e32 v32, vcc, 40, v4
v_addc_u32_e32 v33, vcc, 0, v5, vcc
v_add_i32_e32 v28, vcc, 36, v4
v_addc_u32_e32 v29, vcc, 0, v5, vcc
v_add_i32_e32 v34, vcc, 32, v4
v_addc_u32_e32 v35, vcc, 0, v5, vcc
v_add_i32_e32 v30, vcc, 28, v4
v_addc_u32_e32 v31, vcc, 0, v5, vcc
v_add_i32_e32 v26, vcc, 24, v4
v_addc_u32_e32 v27, vcc, 0, v5, vcc
v_add_i32_e32 v36, vcc, 20, v4
v_addc_u32_e32 v37, vcc, 0, v5, vcc
v_add_i32_e32 v38, vcc, 16, v4
v_addc_u32_e32 v39, vcc, 0, v5, vcc
v_add_i32_e32 v48, vcc, 12, v4
v_addc_u32_e32 v49, vcc, 0, v5, vcc
v_add_i32_e32 v50, vcc, 8, v4
v_addc_u32_e32 v51, vcc, 0, v5, vcc
v_add_i32_e32 v52, vcc, 4, v4
v_addc_u32_e32 v53, vcc, 0, v5, vcc
flat_load_dword v41, v[4:5]
v_mov_b32_e32 v23, 0xe1540001
v_mov_b32_e32 v46, 0x98cf2d5f
v_mov_b32_e32 v47, 0xc7aed40
v_mov_b32_e32 v56, 0x9b88b47b
v_mov_b32_e32 v57, 0xbcb45328
v_mov_b32_e32 v58, 0xb4e45849
v_mov_b32_e32 v59, 0xc6801965
v_mov_b32_e32 v60, 0xe5feee15
flat_load_dword v54, v[54:55]
v_add_i32_e32 v42, vcc, 4, v0
v_addc_u32_e32 v43, vcc, 0, v1, vcc
flat_load_dword v55, v[6:7]
flat_load_dword v61, v[16:17]
v_add_i32_e32 v6, vcc, 8, v0
v_addc_u32_e32 v7, vcc, 0, v1, vcc
flat_load_dword v62, v[8:9]
flat_load_dword v63, v[18:19]
v_add_i32_e32 v8, vcc, 12, v0
v_addc_u32_e32 v9, vcc, 0, v1, vcc
flat_load_dword v22, v[10:11]
flat_load_dword v5, v[20:21]
v_add_i32_e32 v10, vcc, 16, v0
v_addc_u32_e32 v11, vcc, 0, v1, vcc
flat_load_dword v4, v[12:13]
flat_load_dword v44, v[44:45]
v_add_i32_e32 v12, vcc, 20, v0
v_addc_u32_e32 v13, vcc, 0, v1, vcc
flat_load_dword v45, v[14:15]
flat_load_dword v24, v[24:25]
v_add_i32_e32 v14, vcc, 24, v0
v_addc_u32_e32 v15, vcc, 0, v1, vcc
flat_load_dword v25, v[2:3]
flat_load_dword v32, v[32:33]
v_add_i32_e32 v2, vcc, 28, v0
v_addc_u32_e32 v3, vcc, 0, v1, vcc
flat_load_dword v28, v[28:29]
flat_load_dword v29, v[34:35]
v_add_i32_e32 v16, vcc, 32, v0
v_addc_u32_e32 v17, vcc, 0, v1, vcc
flat_load_dword v30, v[30:31]
flat_load_dword v31, v[48:49]
v_add_i32_e32 v18, vcc, 36, v0
v_addc_u32_e32 v19, vcc, 0, v1, vcc
flat_load_dword v33, v[52:53]
flat_load_dword v34, v[50:51]
v_add_i32_e32 v20, vcc, 40, v0
v_addc_u32_e32 v21, vcc, 0, v1, vcc
flat_load_dword v26, v[26:27]
flat_load_dword v27, v[36:37]
flat_load_dword v35, v[38:39]
s_waitcnt vmcnt(0) lgkmcnt(0)
v_add_i32_e32 v36, vcc, v41, v40
v_addc_u32_e32 v33, vcc, v33, v24, vcc
v_addc_u32_e32 v34, vcc, v34, v45, vcc
v_addc_u32_e32 v31, vcc, v31, v44, vcc
v_addc_u32_e32 v4, vcc, v35, v4, vcc
v_addc_u32_e32 v5, vcc, v27, v5, vcc
v_addc_u32_e32 v26, vcc, v26, v22, vcc
v_addc_u32_e32 v27, vcc, v30, v63, vcc
v_addc_u32_e32 v29, vcc, v29, v62, vcc
v_addc_u32_e32 v28, vcc, v28, v61, vcc
v_addc_u32_e32 v30, vcc, v32, v55, vcc
v_addc_u32_e32 v32, vcc, v25, v54, vcc
v_add_i32_e32 v35, vcc, 0x5555, v36
v_mov_b32_e32 v22, 0x46010000
v_addc_u32_e32 v37, vcc, v33, v22, vcc
v_mov_b32_e32 v22, 0x4eac0000
v_addc_u32_e32 v38, vcc, v34, v22, vcc
v_addc_u32_e32 v39, vcc, v31, v23, vcc
v_mov_b32_e32 v22, 0x94f09db
v_addc_u32_e32 v48, vcc, v4, v22, vcc
v_addc_u32_e32 v49, vcc, v5, v46, vcc
v_addc_u32_e32 v50, vcc, v26, v47, vcc
v_addc_u32_e32 v51, vcc, v27, v56, vcc
v_addc_u32_e32 v52, vcc, v29, v57, vcc
v_addc_u32_e32 v53, vcc, v28, v58, vcc
v_addc_u32_e32 v54, vcc, v30, v59, vcc
v_addc_u32_e32 v55, vcc, v32, v60, vcc
v_ashrrev_i32_e32 v22, 31, v55
v_add_i32_e32 v24, vcc, 44, v0
v_addc_u32_e32 v25, vcc, 0, v1, vcc
v_mov_b32_e32 v23, v22
v_cmp_gt_i64_e32 vcc, 0, v[22:23]
v_cndmask_b32_e32 v22, v35, v36, vcc
v_cndmask_b32_e32 v23, v37, v33, vcc
v_cndmask_b32_e32 v33, v38, v34, vcc
v_cndmask_b32_e32 v31, v39, v31, vcc
v_cndmask_b32_e32 v32, v55, v32, vcc
v_cndmask_b32_e32 v30, v54, v30, vcc
v_cndmask_b32_e32 v28, v53, v28, vcc
v_cndmask_b32_e32 v29, v52, v29, vcc
v_cndmask_b32_e32 v27, v51, v27, vcc
v_cndmask_b32_e32 v26, v50, v26, vcc
v_cndmask_b32_e32 v5, v49, v5, vcc
v_cndmask_b32_e32 v4, v48, v4, vcc
flat_store_dword v[42:43], v23
flat_store_dword v[6:7], v33
flat_store_dword v[8:9], v31
flat_store_dword v[10:11], v4
flat_store_dword v[12:13], v5
flat_store_dword v[14:15], v26
flat_store_dword v[2:3], v27
flat_store_dword v[16:17], v29
flat_store_dword v[18:19], v28
flat_store_dword v[20:21], v30
flat_store_dword v[24:25], v32
flat_store_dword v[0:1], v22
buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
s_addk_i32 s32, 0xb000
s_mov_b32 s33, s4
s_waitcnt vmcnt(0) lgkmcnt(0)
s_setpc_b64 s[30:31]
---
amdhsa.target: amdgcn-amd-amdhsa--gfx700
- 1
- 2
When using the full 256 bits, as with secp256k1, we must pay a lot of attention to the inliner:
As mentioned in https://github.com/mratsim/constantine/blob/661a481/README-PERFORMANCE.md#compiler-caveats Compilers have a hard time optimizing bigint operations, even as simple as an addition with carries.
This issue track their evolution and the quality of the code generated with compiler builtins for ISAs of interest.
Note that as of February 2024, we use:
_addcarry_u64
on x86-64.
2019, GCC 9.2 and Clang 9.0
The original problem: https://gcc.godbolt.org/z/2h768y
Even with intrinsics, an operation as simple as addition-with-carry is uglily implemented in GCC. This has been mentioned by the GMP folks 30 years ago: https://gmplib.org/manual/Assembly-Carry-Propagation.html
2024, GCC 13.2 and Clang 17.0
https://gcc.godbolt.org/z/jdecvffaP
GMP fixed the x86 intrinsics, but unfortunately the portable intrinsics have terrible codegen and hence make a terrible fallback for ARM.
Current status
Due to GCC's abysmal __builtin_addcll codegen, it is a non-starter. Clang has decent codegen.
Assembly is still very much needed.
This also explains the bad ARM performance on Apple M1, M2, M3, mentioned by @agnxsh (https://github.com/mratsim/constantine/pull/354#issuecomment-1925688995) and @bkomuves