llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.16k stars 12.03k forks source link

GVNHoist: Split Parent when profitable to do so. #91665

Open hiraditya opened 6 months ago

hiraditya commented 6 months ago

GVNHoist currently bails out unless a value is fully anticipable at a dominator. It may be profitable to split the parents and create a common dominator for all the basic blocks that have a fully anticipable value. e.g.,

graph TD;
    BB -->BB1[BB1:\n I1:VN]
    BB -->BB2[BB2:\n I2:VN]
    BB -->BB3[BB3:\n]  

If only BB1 and BB2 has v(for an instruction I). it might be profitable to create a common predecessor for BB1,BB2 and hoist I there.

graph TD;
    BB -->BB0[BB0:\n I1:VN]
    BB0 -->BB1[BB1:\n]
    BB0 -->BB2[BB2:\n]
    BB -->BB3[BB3:\n]
hiraditya commented 5 months ago

Motivating test case: https://godbolt.org/z/6EYs6T4hM

$ opt -S -passes=gvn-hoist

source_filename = "/app/example.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
  switch i32 %x, label %sw.epilog [
    i32 1, label %sw.bb
    i32 2, label %sw.bb1
    i32 3, label %sw.bb2
    i32 4, label %sw.bb4
  ]

sw.bb:
  %sub = sub nsw i32 1, %y
  tail call void @foo(i32 noundef %sub) #2
  br label %sw.epilog

sw.bb1:
  %div = sdiv i32 2, %y
  tail call void @bar(i32 noundef %div) #2
  br label %sw.epilog

sw.bb2:
  %div3 = sdiv i32 2, %y
  tail call void @baz(i32 noundef %div3) #2
  br label %sw.epilog

sw.bb4:
  %div5 = sdiv i32 2, %y
  tail call void @bal(i32 noundef %div5) #2
  br label %sw.epilog

sw.epilog:
  ret void
}

declare void @foo(i32 noundef) local_unnamed_addr #1

declare void @bar(i32 noundef) local_unnamed_addr #1

declare void @baz(i32 noundef) local_unnamed_addr #1

declare void @bal(i32 noundef) local_unnamed_addr #1

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"PIE Level", i32 2}
!4 = !{i32 7, !"uwtable", i32 2}
!5 = !{!"clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"}
pawan-nirpal-031 commented 4 months ago

@hiraditya How do you propose the transform should be? I could transform it into a fairly poor IR, by inserting another switch. Is there a better way to do this? This might hoist out, but it also incurs additional cost of adding few more instructions like another switch and phis ofcourse. so It might increase instruction count at the expense of hoisting common code. ( This maybe useful when switch has 3 jumps and any two might have common code ) wdyt?

define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
  switch i32 %x, label %sw.epilog [
    i32 1, label %sw.bb
    i32 2, label %sw.pre
    i32 3, label %sw.pre
    i32 4, label %sw.pre
  ]

sw.bb:
  %sub = sub nsw i32 1, %y
  tail call void @foo(i32 noundef %sub) #2
  br label %sw.epilog

sw.pre:
  %div = sdiv i32 2, %y
  switch i32 %x, label %sw.epilog [
    i32 2, label %sw.bb1
    i32 3, label %sw.bb2
    i32 4, label %sw.bb4
  ]

sw.bb1:
  %div_phi1 = phi i32 [ %div, %sw.pre ]
  tail call void @bar(i32 noundef %div_phi1) #2
  br label %sw.epilog

sw.bb2:
  %div_phi2 = phi i32 [ %div, %sw.pre ]
  tail call void @baz(i32 noundef %div_phi2) #2
  br label %sw.epilog

sw.bb4:
  %div_phi4 = phi i32 [ %div, %sw.pre ]
  tail call void @bal(i32 noundef %div_phi4) #2
  br label %sw.epilog

sw.epilog:
  ret void
}
pawan-nirpal-031 commented 4 months ago

Also I think this sort of thing might be better for if/else blocks, but I think that already might be there. I don't know, if gvn hoist already does this.

pawan-nirpal-031 commented 4 months ago

Adding codegen by the baseline and transformed IR. On x86 target. command used : llc ir.ll ( no other llc options used )

baseline :

    .text
    .file   "example.c"
    .globl  B                               # -- Begin function B
    .p2align    4, 0x90
    .type   B,@function
B:                                      # @B
    .cfi_startproc
# %bb.0:                                # %entry
                                        # kill: def $edi killed $edi def $rdi
    decl    %edi
    cmpl    $3, %edi
    ja  .LBB0_6
# %bb.1:                                # %entry
    jmpq    *.LJTI0_0(,%rdi,8)
.LBB0_2:                                # %sw.bb
    movl    $1, %edi
    subl    %esi, %edi
    jmp foo@PLT                         # TAILCALL
.LBB0_4:                                # %sw.bb2
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    %eax, %edi
    jmp baz@PLT                         # TAILCALL
.LBB0_5:                                # %sw.bb4
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    %eax, %edi
    jmp bal@PLT                         # TAILCALL
.LBB0_3:                                # %sw.bb1
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    %eax, %edi
    jmp bar@PLT                         # TAILCALL
.LBB0_6:                                # %sw.epilog
    retq
.Lfunc_end0:
    .size   B, .Lfunc_end0-B
    .cfi_endproc
    .section    .rodata,"a",@progbits
    .p2align    3, 0x0
.LJTI0_0:
    .quad   .LBB0_2
    .quad   .LBB0_3
    .quad   .LBB0_4
    .quad   .LBB0_5
                                        # -- End function
    .ident  "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
    .section    ".note.GNU-stack","",@progbits

transformed :

    .text
    .file   "example.c"
    .globl  B                               # -- Begin function B
    .p2align    4, 0x90
    .type   B,@function
B:                                      # @B
    .cfi_startproc
# %bb.0:                                # %entry
                                        # kill: def $edi killed $edi def $rdi
    leal    -2(%rdi), %eax
    cmpl    $3, %eax
    jae .LBB0_1
# %bb.3:                                # %sw.pre
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    cmpl    $4, %edi
    je  .LBB0_8
# %bb.4:                                # %sw.pre
    cmpl    $3, %edi
    je  .LBB0_7
# %bb.5:                                # %sw.pre
    cmpl    $2, %edi
    jne .LBB0_9
# %bb.6:                                # %sw.bb1
    movl    %eax, %edi
    jmp bar@PLT                         # TAILCALL
.LBB0_1:                                # %entry
    cmpl    $1, %edi
    jne .LBB0_9
# %bb.2:                                # %sw.bb
    movl    $1, %edi
    subl    %esi, %edi
    jmp foo@PLT                         # TAILCALL
.LBB0_9:                                # %sw.epilog
    retq
.LBB0_7:                                # %sw.bb2
    movl    %eax, %edi
    jmp baz@PLT                         # TAILCALL
.LBB0_8:                                # %sw.bb4
    movl    %eax, %edi
    jmp bal@PLT                         # TAILCALL
.Lfunc_end0:
    .size   B, .Lfunc_end0-B
    .cfi_endproc
                                        # -- End function
    .ident  "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
    .section    ".note.GNU-stack","",@progbits
hiraditya commented 4 months ago

This looks better IMO. So for hoistable candidates in n-1 case statement it should be profitable to do so as the top level switch will become if-else subsequently.

pawan-nirpal-031 commented 4 months ago

Looking at the newer test that I created using this as a base, which adds 3 hoistable instructions the code generated for x86 looks promising. I see code side reduction. Not sure if it will translate to any perf gains, but I will start preparing a patch and let's verify on some benchmarks.

base-test2.ll

define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
  switch i32 %x, label %sw.epilog [
    i32 1, label %sw.bb
    i32 2, label %sw.bb1
    i32 3, label %sw.bb2
    i32 4, label %sw.bb4
  ]

sw.bb:
  %sub = sub nsw i32 1, %y
  tail call void @foo(i32 noundef %sub) #2
  br label %sw.epilog

sw.bb1:
  %div = sdiv i32 2, %y
  %add = add nsw i32 4, %y
  %sub2 = sub nsw i32 3, %y
  tail call void @bar(i32 noundef %div) #2
  tail call void @bar(i32 noundef %add) #2
  tail call void @bar(i32 noundef %sub2) #2
  br label %sw.epilog

sw.bb2:
  %div3 = sdiv i32 2, %y
  %add2 = add nsw i32 4, %y
  %sub3 = sub nsw i32 3, %y
  tail call void @baz(i32 noundef %div3) #2
  tail call void @baz(i32 noundef %add2) #2
  tail call void @baz(i32 noundef %sub3) #2
  br label %sw.epilog

sw.bb4:
  %div5 = sdiv i32 2, %y
  %add3 = add nsw i32 4, %y
  %sub4 = sub nsw i32 3, %y
  tail call void @bal(i32 noundef %div5) #2
  tail call void @bal(i32 noundef %add3) #2
  tail call void @bal(i32 noundef %sub4) #2
  br label %sw.epilog

sw.epilog:
  ret void
}

base-test2.s

    .text
    .file   "example.c"
    .globl  B                               # -- Begin function B
    .p2align    4, 0x90
    .type   B,@function
B:                                      # @B
    .cfi_startproc
# %bb.0:                                # %entry
    pushq   %rbp
    .cfi_def_cfa_offset 16
    pushq   %rbx
    .cfi_def_cfa_offset 24
    pushq   %rax
    .cfi_def_cfa_offset 32
    .cfi_offset %rbx, -24
    .cfi_offset %rbp, -16
                                        # kill: def $edi killed $edi def $rdi
    decl    %edi
    cmpl    $3, %edi
    ja  .LBB0_6
# %bb.1:                                # %entry
    jmpq    *.LJTI0_0(,%rdi,8)
.LBB0_2:                                # %sw.bb
    movl    $1, %edi
    subl    %esi, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp foo@PLT                         # TAILCALL
.LBB0_4:                                # %sw.bb2
    .cfi_def_cfa_offset 32
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    $3, %ebx
    subl    %esi, %ebx
    addl    $4, %esi
    movl    %eax, %edi
    movl    %esi, %ebp
    callq   baz@PLT
    movl    %ebp, %edi
    callq   baz@PLT
    movl    %ebx, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp baz@PLT                         # TAILCALL
.LBB0_5:                                # %sw.bb4
    .cfi_def_cfa_offset 32
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    $3, %ebx
    subl    %esi, %ebx
    addl    $4, %esi
    movl    %eax, %edi
    movl    %esi, %ebp
    callq   bal@PLT
    movl    %ebp, %edi
    callq   bal@PLT
    movl    %ebx, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp bal@PLT                         # TAILCALL
.LBB0_3:                                # %sw.bb1
    .cfi_def_cfa_offset 32
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %esi
    movl    $3, %ebx
    subl    %esi, %ebx
    addl    $4, %esi
    movl    %eax, %edi
    movl    %esi, %ebp
    callq   bar@PLT
    movl    %ebp, %edi
    callq   bar@PLT
    movl    %ebx, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp bar@PLT                         # TAILCALL
.LBB0_6:                                # %sw.epilog
    .cfi_def_cfa_offset 32
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    retq
.Lfunc_end0:
    .size   B, .Lfunc_end0-B
    .cfi_endproc
    .section    .rodata,"a",@progbits
    .p2align    3, 0x0
.LJTI0_0:
    .quad   .LBB0_2
    .quad   .LBB0_3
    .quad   .LBB0_4
    .quad   .LBB0_5
                                        # -- End function
    .ident  "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
    .section    ".note.GNU-stack","",@progbits

new-test2.ll

source_filename = "/app/example.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define dso_local void @B(i32 noundef %x, i32 noundef %y) local_unnamed_addr {
entry:
  switch i32 %x, label %sw.epilog [
    i32 1, label %sw.bb
    i32 2, label %sw.pre
    i32 3, label %sw.pre
    i32 4, label %sw.pre
  ]

sw.bb:
  %sub = sub nsw i32 1, %y
  tail call void @foo(i32 noundef %sub) #2
  br label %sw.epilog

sw.pre:
  %div = sdiv i32 2, %y
  %add = add nsw i32 4, %y
  %sub2 = sub nsw i32 3, %y
  switch i32 %x, label %sw.epilog [
    i32 2, label %sw.bb1
    i32 3, label %sw.bb2
    i32 4, label %sw.bb4
  ]

sw.bb1:
  %div1_phi = phi i32 [ %div, %sw.pre ]
  %add1_phi = phi i32 [ %add, %sw.pre ]
  %sub1_phi = phi i32 [ %sub2, %sw.pre ]
  tail call void @bar(i32 noundef %div1_phi) #2
  tail call void @bar(i32 noundef %add1_phi) #2
  tail call void @bar(i32 noundef %sub1_phi) #2
  br label %sw.epilog

sw.bb2:
  %div2_phi = phi i32 [ %div, %sw.pre ]
  %add2_phi = phi i32 [ %add, %sw.pre ]
  %sub2_phi = phi i32 [ %sub2, %sw.pre ]
  tail call void @baz(i32 noundef %div2_phi) #2
  tail call void @baz(i32 noundef %add2_phi) #2
  tail call void @baz(i32 noundef %sub2_phi) #2
  br label %sw.epilog

sw.bb4:
  %div4_phi = phi i32 [ %div, %sw.pre ]
  %add4_phi = phi i32 [ %add, %sw.pre ]
  %sub4_phi = phi i32 [ %sub2, %sw.pre ]
  tail call void @bal(i32 noundef %div4_phi) #2
  tail call void @bal(i32 noundef %add4_phi) #2
  tail call void @bal(i32 noundef %sub4_phi) #2
  br label %sw.epilog

sw.epilog:
  ret void
}

declare void @foo(i32 noundef) local_unnamed_addr #1

declare void @bar(i32 noundef) local_unnamed_addr #1

declare void @baz(i32 noundef) local_unnamed_addr #1

declare void @bal(i32 noundef) local_unnamed_addr #1

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 7, !"Dwarf Version", i32 4}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"PIE Level", i32 2}
!4 = !{i32 7, !"uwtable", i32 2}
!5 = !{!"clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"}

new-test2.s

    .text
    .file   "example.c"
    .globl  B                               # -- Begin function B
    .p2align    4, 0x90
    .type   B,@function
B:                                      # @B
    .cfi_startproc
# %bb.0:                                # %entry
    pushq   %rbp
    .cfi_def_cfa_offset 16
    pushq   %rbx
    .cfi_def_cfa_offset 24
    pushq   %rax
    .cfi_def_cfa_offset 32
    .cfi_offset %rbx, -24
    .cfi_offset %rbp, -16
    movl    %esi, %ebx
                                        # kill: def $edi killed $edi def $rdi
    leal    -2(%rdi), %eax
    cmpl    $3, %eax
    jae .LBB0_1
# %bb.3:                                # %sw.pre
    movl    $2, %eax
    xorl    %edx, %edx
    idivl   %ebx
    movl    $3, %ebp
    subl    %ebx, %ebp
    addl    $4, %ebx
    cmpl    $4, %edi
    je  .LBB0_8
# %bb.4:                                # %sw.pre
    cmpl    $3, %edi
    je  .LBB0_7
# %bb.5:                                # %sw.pre
    cmpl    $2, %edi
    jne .LBB0_9
# %bb.6:                                # %sw.bb1
    movl    %eax, %edi
    callq   bar@PLT
    movl    %ebx, %edi
    callq   bar@PLT
    movl    %ebp, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp bar@PLT                         # TAILCALL
.LBB0_1:                                # %entry
    .cfi_def_cfa_offset 32
    cmpl    $1, %edi
    jne .LBB0_9
# %bb.2:                                # %sw.bb
    movl    $1, %edi
    subl    %ebx, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp foo@PLT                         # TAILCALL
.LBB0_9:                                # %sw.epilog
    .cfi_def_cfa_offset 32
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    retq
.LBB0_7:                                # %sw.bb2
    .cfi_def_cfa_offset 32
    movl    %eax, %edi
    callq   baz@PLT
    movl    %ebx, %edi
    callq   baz@PLT
    movl    %ebp, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp baz@PLT                         # TAILCALL
.LBB0_8:                                # %sw.bb4
    .cfi_def_cfa_offset 32
    movl    %eax, %edi
    callq   bal@PLT
    movl    %ebx, %edi
    callq   bal@PLT
    movl    %ebp, %edi
    addq    $8, %rsp
    .cfi_def_cfa_offset 24
    popq    %rbx
    .cfi_def_cfa_offset 16
    popq    %rbp
    .cfi_def_cfa_offset 8
    jmp bal@PLT                         # TAILCALL
.Lfunc_end0:
    .size   B, .Lfunc_end0-B
    .cfi_endproc
                                        # -- End function
    .ident  "clang version 19.0.0git (https://github.com/llvm/llvm-project.git 8b037862b6eabe2efd4b0dcfdb6768484cd10564)"
    .section    ".note.GNU-stack","",@progbits
pawan-nirpal-031 commented 4 months ago

Ran a static perf analysis tool on base.s and new.s using llvm-mca. I see the following data. On the left is data for base.s and right is data for new.s image

hiraditya commented 4 months ago

Dectect A Switch statement. check if there are common instructions in at least two, branches of this switch statement.

It is trivial as switch will have greater than 2 successors. And the CHI node will be incomplete GVNHoist::valueAnticipable

Essentially, if a value is not anticipable, then split (when profitable).

pawan-nirpal-031 commented 3 months ago

I'm Planning to implement it this way. Currently only taking care of Scalars.

What we have from GVN infrastructure is a Mapping from value numbers to the instructions with the same value numbers. It is like a hash table with value number as the key and a list of instructions as values to the key. Now using this Mapping/Map, what I plan to do is break the problem down into analysis and transform, To perform the analysis I am thinking of building a tree like data structure, a tree with root node, and multiple nodes from root. like a switch tree, from the switch block to the branches of switch. Each node apart from the root node in this tree will have a list of pairs, where each pair will have value number and the pointer to instruction of that value number. The root node will have a dummy value number which is not a valid value number and the pointer to switch instruction itself. After this is constructed we will pass this tree like data structure to an analysis that will tell, given this structure which value numbers should be profitable to be hoisted. Then we will take care of the transform once this analysis works.

How to choose which value numbers to hoist? Ideally we would like to remove maximum redundancy, which can be done by inserting multiple switches in the transform. But I'll stick to inserting only one switch for now. The way we might choose for value number candidates to hoist is, what I would like to define is value number density in a given switch region ( I say a switch region is collection of case BBs of a switch statement. ) the higher this density more profitable/optimal the hoist.

density = (total number of value numbers in candidate hoist set) * (number of BBs the candidate set spans across)

Say a switch has 4 cases. C1, C2, C3 and C4.

If we chose a candidate set as

Hence choosing higher density heuristic might be optimal for creating 1 common dominator/ new switch.

A basic code outline.

struct SwitchTreeNode{
  /*
    If this is root node then list(<dummyVN, switchInst>) or else it will be a list of list(<V1,I1>, <V2,I2>, <v3,I3>... so on) for each basic block in switch region.
  */
  std::map<int, Instruction*> ValueInstList;
  /*
    Only root node will have children. They will be branches to switch case blocks. 
  */
  std::vector<SwitchTreeNode*> children;

}

SwitchTreeNode *BuildSwitchtree(SwitchInst *SwI){

}

vector<int> ChooseHoistableCandidates(SwitchTreeNode *SwitchRoot){

}

void performSwitchHoist(Function &F,const VNtoInsns &ScalarsMap){
  vector<SwitchInst *SI> SwitchInstrs;
  for(BasicBlock &BB : F){
    for(Instruction &I : BB){
      if(SwitchInst *SwI = dyn_cast<SwitchInst>(&I)){
        SwitchTreeNode *SwitchRoot = BuildSwitchtree(SwI);
        vector<int> HoistCand = ChooseHoistableCandidates(SwitchRoot);
        if(!HoistCand.empty())
          // The actual transform routine. 
      }
    }
  }
}
pawan-nirpal-031 commented 3 months ago

Initial implementation POC https://github.com/pawan-nirpal-031/llvm-project/commit/d0672ffaa459e2bed8b97909766936f8388a5dc1