Open hiraditya opened 6 months ago
Motivating test case: https://godbolt.org/z/6EYs6T4hM
$ opt -S -passes=gvn-hoist
source_filename = "/app/example.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
switch i32 %x, label %sw.epilog [
i32 1, label %sw.bb
i32 2, label %sw.bb1
i32 3, label %sw.bb2
i32 4, label %sw.bb4
]
sw.bb:
%sub = sub nsw i32 1, %y
tail call void @foo(i32 noundef %sub) #2
br label %sw.epilog
sw.bb1:
%div = sdiv i32 2, %y
tail call void @bar(i32 noundef %div) #2
br label %sw.epilog
sw.bb2:
%div3 = sdiv i32 2, %y
tail call void @baz(i32 noundef %div3) #2
br label %sw.epilog
sw.bb4:
%div5 = sdiv i32 2, %y
tail call void @bal(i32 noundef %div5) #2
br label %sw.epilog
sw.epilog:
ret void
}
declare void @foo(i32 noundef) local_unnamed_addr #1
declare void @bar(i32 noundef) local_unnamed_addr #1
declare void @baz(i32 noundef) local_unnamed_addr #1
declare void @bal(i32 noundef) local_unnamed_addr #1
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}
!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"PIE Level", i32 2}
!4 = !{i32 7, !"uwtable", i32 2}
!5 = !{!"clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"}
@hiraditya How do you propose the transform should be? I could transform it into a fairly poor IR, by inserting another switch. Is there a better way to do this? This might hoist out, but it also incurs additional cost of adding few more instructions like another switch and phis ofcourse. so It might increase instruction count at the expense of hoisting common code. ( This maybe useful when switch has 3 jumps and any two might have common code ) wdyt?
define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
switch i32 %x, label %sw.epilog [
i32 1, label %sw.bb
i32 2, label %sw.pre
i32 3, label %sw.pre
i32 4, label %sw.pre
]
sw.bb:
%sub = sub nsw i32 1, %y
tail call void @foo(i32 noundef %sub) #2
br label %sw.epilog
sw.pre:
%div = sdiv i32 2, %y
switch i32 %x, label %sw.epilog [
i32 2, label %sw.bb1
i32 3, label %sw.bb2
i32 4, label %sw.bb4
]
sw.bb1:
%div_phi1 = phi i32 [ %div, %sw.pre ]
tail call void @bar(i32 noundef %div_phi1) #2
br label %sw.epilog
sw.bb2:
%div_phi2 = phi i32 [ %div, %sw.pre ]
tail call void @baz(i32 noundef %div_phi2) #2
br label %sw.epilog
sw.bb4:
%div_phi4 = phi i32 [ %div, %sw.pre ]
tail call void @bal(i32 noundef %div_phi4) #2
br label %sw.epilog
sw.epilog:
ret void
}
Also I think this sort of thing might be better for if/else blocks, but I think that already might be there. I don't know, if gvn hoist already does this.
Adding codegen by the baseline and transformed IR. On x86 target. command used : llc ir.ll ( no other llc options used )
baseline :
.text
.file "example.c"
.globl B # -- Begin function B
.p2align 4, 0x90
.type B,@function
B: # @B
.cfi_startproc
# %bb.0: # %entry
# kill: def $edi killed $edi def $rdi
decl %edi
cmpl $3, %edi
ja .LBB0_6
# %bb.1: # %entry
jmpq *.LJTI0_0(,%rdi,8)
.LBB0_2: # %sw.bb
movl $1, %edi
subl %esi, %edi
jmp foo@PLT # TAILCALL
.LBB0_4: # %sw.bb2
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl %eax, %edi
jmp baz@PLT # TAILCALL
.LBB0_5: # %sw.bb4
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl %eax, %edi
jmp bal@PLT # TAILCALL
.LBB0_3: # %sw.bb1
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl %eax, %edi
jmp bar@PLT # TAILCALL
.LBB0_6: # %sw.epilog
retq
.Lfunc_end0:
.size B, .Lfunc_end0-B
.cfi_endproc
.section .rodata,"a",@progbits
.p2align 3, 0x0
.LJTI0_0:
.quad .LBB0_2
.quad .LBB0_3
.quad .LBB0_4
.quad .LBB0_5
# -- End function
.ident "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
.section ".note.GNU-stack","",@progbits
transformed :
.text
.file "example.c"
.globl B # -- Begin function B
.p2align 4, 0x90
.type B,@function
B: # @B
.cfi_startproc
# %bb.0: # %entry
# kill: def $edi killed $edi def $rdi
leal -2(%rdi), %eax
cmpl $3, %eax
jae .LBB0_1
# %bb.3: # %sw.pre
movl $2, %eax
xorl %edx, %edx
idivl %esi
cmpl $4, %edi
je .LBB0_8
# %bb.4: # %sw.pre
cmpl $3, %edi
je .LBB0_7
# %bb.5: # %sw.pre
cmpl $2, %edi
jne .LBB0_9
# %bb.6: # %sw.bb1
movl %eax, %edi
jmp bar@PLT # TAILCALL
.LBB0_1: # %entry
cmpl $1, %edi
jne .LBB0_9
# %bb.2: # %sw.bb
movl $1, %edi
subl %esi, %edi
jmp foo@PLT # TAILCALL
.LBB0_9: # %sw.epilog
retq
.LBB0_7: # %sw.bb2
movl %eax, %edi
jmp baz@PLT # TAILCALL
.LBB0_8: # %sw.bb4
movl %eax, %edi
jmp bal@PLT # TAILCALL
.Lfunc_end0:
.size B, .Lfunc_end0-B
.cfi_endproc
# -- End function
.ident "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
.section ".note.GNU-stack","",@progbits
This looks better IMO. So for hoistable candidates in n-1 case statement it should be profitable to do so as the top level switch will become if-else subsequently.
Looking at the newer test that I created using this as a base, which adds 3 hoistable instructions the code generated for x86 looks promising. I see code side reduction. Not sure if it will translate to any perf gains, but I will start preparing a patch and let's verify on some benchmarks.
base-test2.ll
define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
switch i32 %x, label %sw.epilog [
i32 1, label %sw.bb
i32 2, label %sw.bb1
i32 3, label %sw.bb2
i32 4, label %sw.bb4
]
sw.bb:
%sub = sub nsw i32 1, %y
tail call void @foo(i32 noundef %sub) #2
br label %sw.epilog
sw.bb1:
%div = sdiv i32 2, %y
%add = add nsw i32 4, %y
%sub2 = sub nsw i32 3, %y
tail call void @bar(i32 noundef %div) #2
tail call void @bar(i32 noundef %add) #2
tail call void @bar(i32 noundef %sub2) #2
br label %sw.epilog
sw.bb2:
%div3 = sdiv i32 2, %y
%add2 = add nsw i32 4, %y
%sub3 = sub nsw i32 3, %y
tail call void @baz(i32 noundef %div3) #2
tail call void @baz(i32 noundef %add2) #2
tail call void @baz(i32 noundef %sub3) #2
br label %sw.epilog
sw.bb4:
%div5 = sdiv i32 2, %y
%add3 = add nsw i32 4, %y
%sub4 = sub nsw i32 3, %y
tail call void @bal(i32 noundef %div5) #2
tail call void @bal(i32 noundef %add3) #2
tail call void @bal(i32 noundef %sub4) #2
br label %sw.epilog
sw.epilog:
ret void
}
base-test2.s
.text
.file "example.c"
.globl B # -- Begin function B
.p2align 4, 0x90
.type B,@function
B: # @B
.cfi_startproc
# %bb.0: # %entry
pushq %rbp
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
pushq %rax
.cfi_def_cfa_offset 32
.cfi_offset %rbx, -24
.cfi_offset %rbp, -16
# kill: def $edi killed $edi def $rdi
decl %edi
cmpl $3, %edi
ja .LBB0_6
# %bb.1: # %entry
jmpq *.LJTI0_0(,%rdi,8)
.LBB0_2: # %sw.bb
movl $1, %edi
subl %esi, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp foo@PLT # TAILCALL
.LBB0_4: # %sw.bb2
.cfi_def_cfa_offset 32
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl $3, %ebx
subl %esi, %ebx
addl $4, %esi
movl %eax, %edi
movl %esi, %ebp
callq baz@PLT
movl %ebp, %edi
callq baz@PLT
movl %ebx, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp baz@PLT # TAILCALL
.LBB0_5: # %sw.bb4
.cfi_def_cfa_offset 32
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl $3, %ebx
subl %esi, %ebx
addl $4, %esi
movl %eax, %edi
movl %esi, %ebp
callq bal@PLT
movl %ebp, %edi
callq bal@PLT
movl %ebx, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp bal@PLT # TAILCALL
.LBB0_3: # %sw.bb1
.cfi_def_cfa_offset 32
movl $2, %eax
xorl %edx, %edx
idivl %esi
movl $3, %ebx
subl %esi, %ebx
addl $4, %esi
movl %eax, %edi
movl %esi, %ebp
callq bar@PLT
movl %ebp, %edi
callq bar@PLT
movl %ebx, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp bar@PLT # TAILCALL
.LBB0_6: # %sw.epilog
.cfi_def_cfa_offset 32
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size B, .Lfunc_end0-B
.cfi_endproc
.section .rodata,"a",@progbits
.p2align 3, 0x0
.LJTI0_0:
.quad .LBB0_2
.quad .LBB0_3
.quad .LBB0_4
.quad .LBB0_5
# -- End function
.ident "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
.section ".note.GNU-stack","",@progbits
new-test2.ll
source_filename = "/app/example.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
switch i32 %x, label %sw.epilog [
i32 1, label %sw.bb
i32 2, label %sw.pre
i32 3, label %sw.pre
i32 4, label %sw.pre
]
sw.bb:
%sub = sub nsw i32 1, %y
tail call void @foo(i32 noundef %sub) #2
br label %sw.epilog
sw.pre:
%div = sdiv i32 2, %y
%add = add nsw i32 4, %y
%sub2 = sub nsw i32 3, %y
switch i32 %x, label %sw.epilog [
i32 2, label %sw.bb1
i32 3, label %sw.bb2
i32 4, label %sw.bb4
]
sw.bb1:
%div1_phi = phi i32 [ %div, %sw.pre ]
%add1_phi = phi i32 [ %add, %sw.pre ]
%sub1_phi = phi i32 [ %sub2, %sw.pre ]
tail call void @bar(i32 noundef %div1_phi) #2
tail call void @bar(i32 noundef %add1_phi) #2
tail call void @bar(i32 noundef %sub1_phi) #2
br label %sw.epilog
sw.bb2:
%div2_phi = phi i32 [ %div, %sw.pre ]
%add2_phi = phi i32 [ %add, %sw.pre ]
%sub2_phi = phi i32 [ %sub2, %sw.pre ]
tail call void @baz(i32 noundef %div2_phi) #2
tail call void @baz(i32 noundef %add2_phi) #2
tail call void @baz(i32 noundef %sub2_phi) #2
br label %sw.epilog
sw.bb4:
%div4_phi = phi i32 [ %div, %sw.pre ]
%add4_phi = phi i32 [ %add, %sw.pre ]
%sub4_phi = phi i32 [ %sub2, %sw.pre ]
tail call void @bal(i32 noundef %div4_phi) #2
tail call void @bal(i32 noundef %add4_phi) #2
tail call void @bal(i32 noundef %sub4_phi) #2
br label %sw.epilog
sw.epilog:
ret void
}
declare void @foo(i32 noundef) local_unnamed_addr #1
declare void @bar(i32 noundef) local_unnamed_addr #1
declare void @baz(i32 noundef) local_unnamed_addr #1
declare void @bal(i32 noundef) local_unnamed_addr #1
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}
!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"PIE Level", i32 2}
!4 = !{i32 7, !"uwtable", i32 2}
!5 = !{!"clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"}
new-test2.s
.text
.file "example.c"
.globl B # -- Begin function B
.p2align 4, 0x90
.type B,@function
B: # @B
.cfi_startproc
# %bb.0: # %entry
pushq %rbp
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
pushq %rax
.cfi_def_cfa_offset 32
.cfi_offset %rbx, -24
.cfi_offset %rbp, -16
movl %esi, %ebx
# kill: def $edi killed $edi def $rdi
leal -2(%rdi), %eax
cmpl $3, %eax
jae .LBB0_1
# %bb.3: # %sw.pre
movl $2, %eax
xorl %edx, %edx
idivl %ebx
movl $3, %ebp
subl %ebx, %ebp
addl $4, %ebx
cmpl $4, %edi
je .LBB0_8
# %bb.4: # %sw.pre
cmpl $3, %edi
je .LBB0_7
# %bb.5: # %sw.pre
cmpl $2, %edi
jne .LBB0_9
# %bb.6: # %sw.bb1
movl %eax, %edi
callq bar@PLT
movl %ebx, %edi
callq bar@PLT
movl %ebp, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp bar@PLT # TAILCALL
.LBB0_1: # %entry
.cfi_def_cfa_offset 32
cmpl $1, %edi
jne .LBB0_9
# %bb.2: # %sw.bb
movl $1, %edi
subl %ebx, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp foo@PLT # TAILCALL
.LBB0_9: # %sw.epilog
.cfi_def_cfa_offset 32
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.LBB0_7: # %sw.bb2
.cfi_def_cfa_offset 32
movl %eax, %edi
callq baz@PLT
movl %ebx, %edi
callq baz@PLT
movl %ebp, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp baz@PLT # TAILCALL
.LBB0_8: # %sw.bb4
.cfi_def_cfa_offset 32
movl %eax, %edi
callq bal@PLT
movl %ebx, %edi
callq bal@PLT
movl %ebp, %edi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
jmp bal@PLT # TAILCALL
.Lfunc_end0:
.size B, .Lfunc_end0-B
.cfi_endproc
# -- End function
.ident "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
.section ".note.GNU-stack","",@progbits
Ran a static perf analysis tool on base.s and new.s using llvm-mca. I see the following data. On the left is data for base.s and right is data for new.s
Dectect A Switch statement. check if there are common instructions in at least two, branches of this switch statement.
It is trivial as switch will have greater than 2 successors. And the CHI node will be incomplete GVNHoist::valueAnticipable
Essentially, if a value is not anticipable, then split (when profitable).
I'm Planning to implement it this way. Currently only taking care of Scalars.
What we have from GVN infrastructure is a Mapping from value numbers to the instructions with the same value numbers. It is like a hash table with value number as the key and a list of instructions as values to the key. Now using this Mapping/Map, what I plan to do is break the problem down into analysis and transform, To perform the analysis I am thinking of building a tree like data structure, a tree with root node, and multiple nodes from root. like a switch tree, from the switch block to the branches of switch. Each node apart from the root node in this tree will have a list of pairs, where each pair will have value number and the pointer to instruction of that value number. The root node will have a dummy value number which is not a valid value number and the pointer to switch instruction itself. After this is constructed we will pass this tree like data structure to an analysis that will tell, given this structure which value numbers should be profitable to be hoisted. Then we will take care of the transform once this analysis works.
How to choose which value numbers to hoist? Ideally we would like to remove maximum redundancy, which can be done by inserting multiple switches in the transform. But I'll stick to inserting only one switch for now. The way we might choose for value number candidates to hoist is, what I would like to define is value number density in a given switch region ( I say a switch region is collection of case BBs of a switch statement. ) the higher this density more profitable/optimal the hoist.
density = (total number of value numbers in candidate hoist set) * (number of BBs the candidate set spans across)
Say a switch has 4 cases. C1, C2, C3 and C4.
If we chose a candidate set as
Hence choosing higher density heuristic might be optimal for creating 1 common dominator/ new switch.
A basic code outline.
struct SwitchTreeNode{
/*
If this is root node then list(<dummyVN, switchInst>) or else it will be a list of list(<V1,I1>, <V2,I2>, <v3,I3>... so on) for each basic block in switch region.
*/
std::map<int, Instruction*> ValueInstList;
/*
Only root node will have children. They will be branches to switch case blocks.
*/
std::vector<SwitchTreeNode*> children;
}
SwitchTreeNode *BuildSwitchtree(SwitchInst *SwI){
}
vector<int> ChooseHoistableCandidates(SwitchTreeNode *SwitchRoot){
}
void performSwitchHoist(Function &F,const VNtoInsns &ScalarsMap){
vector<SwitchInst *SI> SwitchInstrs;
for(BasicBlock &BB : F){
for(Instruction &I : BB){
if(SwitchInst *SwI = dyn_cast<SwitchInst>(&I)){
SwitchTreeNode *SwitchRoot = BuildSwitchtree(SwI);
vector<int> HoistCand = ChooseHoistableCandidates(SwitchRoot);
if(!HoistCand.empty())
// The actual transform routine.
}
}
}
}
Initial implementation POC https://github.com/pawan-nirpal-031/llvm-project/commit/d0672ffaa459e2bed8b97909766936f8388a5dc1
GVNHoist currently bails out unless a value is fully anticipable at a dominator. It may be profitable to split the parents and create a common dominator for all the basic blocks that have a fully anticipable value. e.g.,
If only BB1 and BB2 has
v
(for an instructionI
). it might be profitable to create a common predecessor for BB1,BB2 and hoistI
there.