Open mratsim opened 1 month ago
It seems from playing with LLVMRunPasses
that the scc-oz-module-inliner
pass triggers this behavior reliably
https://alive2.llvm.org/ce/z/htetw5
Original LLVM IR
; ModuleID = 'x86_poc'
source_filename = "x86_poc"
target triple = "x86_64-pc-linux-gnu"
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bn254_snarks_fr_mod = constant i256 21888242871839275222246405745257275088548364400416034343698204186575808495617, section "ctt.bn254_snarks_fr.constants", align 64
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) section "bn254_snarks_fp" {
call fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
ret void
}
define internal fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%a_plus_b = add i256 %a, %b
%5 = alloca [4 x i64], align 8
store i256 %a_plus_b, ptr %5, align 4
call fastcc void @_finalsub_noo_u64x4(ptr %0, ptr %5, ptr %3)
ret void
}
define internal fastcc void @_finalsub_noo_u64x4(ptr %0, ptr %1, ptr %2) section "ctt.fields" {
%M = load i256, ptr %2, align 4
%a = load i256, ptr %1, align 4
%a_minus_M = sub i256 %a, %M
%borrow = icmp ult i256 %a, %M
%4 = select i1 %borrow, i256 %a, i256 %a_minus_M
store i256 %4, ptr %0, align 4
ret void
}
; Comment this out for good codegen (or use Clang -O0)
define void @bn254_snarks_fr_add(ptr %0, ptr %1, ptr %2) section "bn254_snarks_fr" {
call fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fr_mod)
ret void
}
Pessimized
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #0 section "bn254_snarks_fp" {
%a.i = load i256, ptr %1, align 4
%b.i = load i256, ptr %2, align 4
%a_plus_b.i = add i256 %b.i, %a.i
%borrow.i.i = icmp ult i256 %a_plus_b.i, 21888242871839275222246405745257275088696311157297823662689037894645226208583
%a_minus_M.i.i.neg = select i1 %borrow.i.i, i256 0, i256 -21888242871839275222246405745257275088696311157297823662689037894645226208583
%4 = add i256 %a_minus_M.i.i.neg, %a_plus_b.i
store i256 %4, ptr %0, align 4
ret void
}
define void @bn254_snarks_fr_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #0 section "bn254_snarks_fr" {
%a.i = load i256, ptr %1, align 4
%b.i = load i256, ptr %2, align 4
%a_plus_b.i = add i256 %b.i, %a.i
%borrow.i.i = icmp ult i256 %a_plus_b.i, 21888242871839275222246405745257275088548364400416034343698204186575808495617
%a_minus_M.i.i.neg = select i1 %borrow.i.i, i256 0, i256 -21888242871839275222246405745257275088548364400416034343698204186575808495617
%4 = add i256 %a_minus_M.i.i.neg, %a_plus_b.i
store i256 %4, ptr %0, align 4
ret void
}
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
This is a follow-up to my quest to do efficient cross-ISA modular arithmetic for cryptography and finding a workaround to #102062
Ignoring the loads/stores, the following LLVM IR is optimized into an optimal sequence of adc,sbb,cmov with
or a single calleredit: the sequence is still unoptimalwhen compiled with Clang
https://alive2.llvm.org/ce/z/hnQycG
but it becomes adc,sbb,cmov,adc for a 33% more compute instructions with the following IR.
LLVM IR
Clang -O1
Clang -O0
Clang -O1 but with only a single proc
edit: actually it is unoptimal, the first add is repeated
Extra context
Modular addition is critical to optimize for cryptography and is used everywhere (HTTPS / authentication to websites). Currently state-of-the-art libraries have to use assembly for both speed and correctness reasons (constant-time) and improved compiler support is important for more robust software and also usage on wider hardware (WASM, GPUs, FPGAs, ...)
Reproduction
I used to be able to reproduce it with
llc
andopt
,opt
moved the select one instruction apart from the borrow and reordered instructions so that either 0 or he modulus was added. Unfortunately I lost the exact fiddling that produced that output.