rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
98.23k stars 12.7k forks source link

LLVM's pass ordering chokes on zero-cost abstractions. #44041

Open eddyb opened 7 years ago

eddyb commented 7 years ago

The latest example of this I've seen was posted by @Veedrac to Hacker News (try on playground):

static table: [i32; 4] = [0; 4];

pub fn exists_in_table(v: i32) -> bool {
    for &x in table.iter() {
        if x == v {
            return true;
        }
    }

    false
}

After -C opt-level=3 the ~600 lines of unoptimized IR turn into this (note br i1 false):

; playground::exists_in_table
; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN10playground15exists_in_table17h0a39bf7934b9be37E(i32) unnamed_addr #0 {
start:
  br label %bb3

bb3:                                              ; preds = %start
  br i1 false, label %bb9, label %bb6

bb6:                                              ; preds = %bb3
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %bb9, label %bb3.1

bb9:                                              ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
  %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
  ret i1 %_0.0

bb3.1:                                            ; preds = %bb6
  br i1 false, label %bb9, label %bb6.1

bb6.1:                                            ; preds = %bb3.1
  %2 = icmp eq i32 %0, 0
  br i1 %2, label %bb9, label %bb3.2

bb3.2:                                            ; preds = %bb6.1
  br i1 false, label %bb9, label %bb6.2

bb6.2:                                            ; preds = %bb3.2
  %3 = icmp eq i32 %0, 0
  br i1 %3, label %bb9, label %bb3.3

bb3.3:                                            ; preds = %bb6.2
  br i1 false, label %bb9, label %bb6.3

bb6.3:                                            ; preds = %bb3.3
  %4 = icmp eq i32 %0, 0
  br i1 %4, label %bb9, label %bb3.4

bb3.4:                                            ; preds = %bb6.3
  br i1 true, label %bb9, label %bb6.4

bb6.4:                                            ; preds = %bb3.4
  br label %bb9
}
nagisa commented 7 years ago

cc https://github.com/rust-lang/rust/issues/33299

andjo403 commented 7 years ago

I'm very new to llvm and compilers but I was trying to see what the problem was with this so I was running: rustc -Cllvm-args='-print-after-all' -O --emit llvm-ir badopt.rs |& rustfilt and get a part of the log that looks like this:

*** IR Dump After Loop-Closed SSA Form Pass ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
  br label %bb3

bb3:                                              ; preds = %bb6, %start
  %iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ]
  %1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0)
  br i1 %1, label %bb9, label %bb6

bb6:                                              ; preds = %bb3
  %2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1
  %3 = load i32, i32* %iter.sroa.0.0, align 4
  %4 = icmp eq i32 %3, %0
  br i1 %4, label %bb9, label %bb3

bb9:                                              ; preds = %bb3, %bb6
  %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ]
  ret i1 %_0.0
}
*** IR Dump After Combine redundant instructions ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
  br label %bb3

bb3:                                              ; preds = %start
  br i1 false, label %bb9, label %bb6

bb6:                                              ; preds = %bb3
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %bb9, label %bb3.1

bb9:                                              ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
  %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
  ret i1 %_0.0

bb3.1:                                            ; preds = %bb6
  br i1 false, label %bb9, label %bb6.1

bb6.1:                                            ; preds = %bb3.1
  %2 = icmp eq i32 %0, 0
  br i1 %2, label %bb9, label %bb3.2

bb3.2:                                            ; preds = %bb6.1
  br i1 false, label %bb9, label %bb6.2

bb6.2:                                            ; preds = %bb3.2
  %3 = icmp eq i32 %0, 0
  br i1 %3, label %bb9, label %bb3.3

bb3.3:                                            ; preds = %bb6.2
  br i1 false, label %bb9, label %bb6.3

bb6.3:                                            ; preds = %bb3.3
  %4 = icmp eq i32 %0, 0
  br i1 %4, label %bb9, label %bb3.4

bb3.4:                                            ; preds = %bb6.3
  br i1 true, label %bb9, label %bb6.4

bb6.4:                                            ; preds = %bb3.4
  br label %bb9
}

after the Combine redundant instructions (instcombine) pass there is no more Simplify the CFG (simplifycfg). To me it looks like instcombine change the CFG. But the instcombine pass description states that "This pass does not modify the CFG." is it possible that this is the problem?

eddyb commented 7 years ago

@andjo403 Oh wow, that is very useful, thanks! I thought I was wrong in assuming that a very complex interaction between passes causes this result. Those are consecutive passes, right?

I don't understand how there's more basic blocks after instcombine. Looking more at it, the .1, .2, .3 and .4 suffixes suggest loop unrolling, but instcombine shouldn't do that AFAIK.

andjo403 commented 7 years ago

I added -print-before-all also and then there is an Unroll loops between the passes from the log before so instcombine is not the problem. new log:

*** IR Dump Before Unroll loops ***
bb3:                                              ; preds = %bb6, %start
  %iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ]
  %1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0)
  br i1 %1, label %bb9, label %bb6

bb6:                                              ; preds = %bb3
  %2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1
  %3 = load i32, i32* %iter.sroa.0.0, align 4
  %4 = icmp eq i32 %3, %0
  br i1 %4, label %bb9, label %bb3
*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
  br label %bb3

bb3:                                              ; preds = %start
  br i1 false, label %bb9, label %bb6

bb6:                                              ; preds = %bb3
  %1 = icmp eq i32 0, %0
  br i1 %1, label %bb9, label %bb3.1

bb9:                                              ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
  %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
  ret i1 %_0.0

bb3.1:                                            ; preds = %bb6
  br i1 false, label %bb9, label %bb6.1

bb6.1:                                            ; preds = %bb3.1
  %2 = icmp eq i32 0, %0
  br i1 %2, label %bb9, label %bb3.2

bb3.2:                                            ; preds = %bb6.1
  br i1 false, label %bb9, label %bb6.2

bb6.2:                                            ; preds = %bb3.2
  %3 = icmp eq i32 0, %0
  br i1 %3, label %bb9, label %bb3.3

bb3.3:                                            ; preds = %bb6.2
  br i1 false, label %bb9, label %bb6.3

bb6.3:                                            ; preds = %bb3.3
  %4 = icmp eq i32 0, %0
  br i1 %4, label %bb9, label %bb3.4

bb3.4:                                            ; preds = %bb6.3
  br i1 true, label %bb9, label %bb6.4

bb6.4:                                            ; preds = %bb3.4
  br label %bb9
}

apparently Unroll loops does not make an IR dump after the pass — strange.

eddyb commented 7 years ago

Okay, so the question is why would loop unrolling not have a CFG simplification pass after it?

eddyb commented 7 years ago

This seems like the right combination of passes: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L782-L790

Also good here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L620-L625

This on the other hand looks bad: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L374

And this is probably the one that unrolls the loop here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L629-L632

nossralf commented 7 years ago

So @andjo403 and I spoke about this at lunch the other day, and I took some time today to just add a CFG simplification pass at the places you outlined above to see what happened.

Turns out, adding it on line 633 after the unroll/instruction simplification fixes the issue. The resulting IR for the function reduces to:

; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 {
start:
  %1 = icmp eq i32 %0, 0
  %. = select i1 %1, i1 true, i1 false
  ret i1 %.
}

which is what we want, I believe.

I suspect that the unroll on line 374 may in fact not be a problem thanks to the CFG simplification pass that follows it on line 382. (Unless the LoadCombine/DCE passes that run in between cause changes that negatively affect SimplifyCFG. I know nothing about LLVM internals, but my hunch is that it's unlikely.)

hanna-kruppe commented 7 years ago

That's great!

I can't help but notice, though, that the IR you showed could be simplified further by instcombine. Maybe it would make sense to add the simplifyCFG before the instcombine, or add another instcombine afterwards?

nossralf commented 7 years ago

I'll run a couple of builds trying that out.

nossralf commented 7 years ago

Nice, simply moving SimplifyCFG up before InstCombine gives this:

; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 {
start:
  %1 = icmp eq i32 %0, 0
  ret i1 %1
}

Adding another InstCombine after the SimplifyCFG from my previous attempt gives the same output.

Not sure if there are situations where the longer incantation of Unroll>InstCombine>SimplifyCFG>InstCombine would be preferable to just Unroll>SimplifyCFG>InstCombine.

hanna-kruppe commented 7 years ago

Yeah I'm unsure as well. Running more passes for no good reason should be avoided, but InstCombine can also unlock opportunities for CFG simplification (by turning branch conditions into constants), so in general there no single right order. In fact, some of the PMB pieces @eddyb linked above run InstCombine>SimplifyCFG and others run SimplifyCFG>InstCombine (though none run IC>SimplifyCFG>IC, IIRC). Maybe LLVM developers have an idea? (Has someone already filed an LLVM issue for this?)

nossralf commented 7 years ago

I was doing some legwork whilst writing an LLVM bug for this, and noticed that on LLVM trunk, they've added a SimplifyCFG pass at the very end of the function which I modified above (populateModulePassManager).

It was added in this change: https://reviews.llvm.org/rL301395

I made the corresponding change in the Rust tree, and the end result is the second to last LLVM-IR above, with the icmp and select. So not fully optimal but at least much better. I guess Rust would get this change once LLVM 5.0 is in the tree.

Is there still a point to filing an LLVM bug/request for comment on this? Supposedly the suggestion would then probably be to add another InstCombine after the last SimplifyCFG, which should result in the optimal case. If yes I'll try to get one posted tomorrow.

hanna-kruppe commented 7 years ago

We could bring up whether it might make sense to run SimplifyCFG before instcombine, but tbh it's probably not worth the trouble. @eddyb has had some thoughts about more principled ways to ensure certain simplifications are reliably applied. Such solutions would help all programs, rather than plugging one hole at a time.

I also heard sentiments from LLVM developers that instcombine is already run a bit too often for their tastes, so "pls spam this pass some more so my silly test program is marginally faster" may be dismissed on that grounds as well.

dcci commented 6 years ago

Hi, I'm one of the developers who thinks Instcombine went a little too far :) FWIW, here's my thought (I wrote on the llvm-dev mailing list, but let me copy here in case)

SimplifyCFG is the correct pass for this kind of transformation. I
don't think it's entirely unreasonable to run it more often, but the
compile time cost needs to be taken in account.

I'm not necessarily sympathetic to the idea of adding another canonicalization
pass only for this purpose.

Side note:
IMHO, at this point SimplifyCFG is doing even more than it should
(including, e.g. sinking of variable from "almost empty" BBs, i.e. BBs
which have only a single instruction & terminator, if I recall
correctly). If anything, I'd rather split the sinking logic and other operations
to a different pass, rather than moving simplifying proven unconditional
branches elsewhere :)

Now that LLVM has an incremental dominator API it should be more
feasible to run SimplifyCFG more times without recomputing the
dominator all the time. I haven't checked whether there are other
expensive analyses that need to be preserved
dcci commented 6 years ago

Also, FWIW, to clear other people's doubt. SimplifyCFG + Instcombine when run in tandem don't reach a maximal fixpoint. When running the inliner with function simplification passes pipelines, this is fairly visible (see, e.g. Chandler's talk https://www.youtube.com/watch?v=s4wnuiCwTGU for a more detailed explanation of the problem).

It's of course possible to run instcombine + simplifycfg in a loop n times while inlining, and that would presumably make the code faster, at some expense in compile time (in the std::vector<>::push_back example in the talk, that's quantifiable by the number of entries that can be simplified), but in general it should be a good win for some cases.

I guess it's entirely up to the user/downstream language to decide whether once the new pass manager will be in place it's profitable to run these passes in a loop. I think there are plans to implement this feature, but it's unlikely it will be the default (presumably, it will be guarded by a cl::opt or something like that)

arielb1 commented 6 years ago

@dcci

Looking at performance stats again, I don't think adding 2 more calls to SimplifyCfg (one between IndVarSimplify/LoopIdiomRecognize - which I think is fairly important, because people tend to get annoyed about LoopIdiomRecognize not working well - and one right after loop unrolling) would be too much of a problem.

dcci commented 6 years ago

@arielb1 Fair enough. I think it might be a good addition for C++/clang as well, but I haven't gotten the time to check. Some people (TM) are trying very hard upstream to make a mess of SimplifyCFG, but there's hope they'll fail, so I think your plan is reasonable. Can you add performance/compile time numbers after your experiments? Thanks!

cc: @alexcrichton

dcci commented 6 years ago

Also, I haven't looked at the IR produced by Rust very closely but I'd like to point out two things: 1) The current pass manager pipeline (both module and function) has been heavily optimized for C/C++. If Rust wants to propose an alternative, well, maybe we (LLVM) should take a look 2) There have been many discussions about the pipeline getting older and older (and therefore, less tuned). During the ThinLTO bringup people tuned it a bit, but the Module and the LTO pipeline are not necessarily the best (e.g. we run loop vectorizer and loop unroller both in the per-TU pipeline and in the LTO pipeline). In other words, if you're interested, this is something that needs some love.

alexcrichton commented 6 years ago

@dcci heh I'd definitely believe there's some possible improvements here! AFAIK we're using the "vanilla" pass manager builders in LLVM in the sense of we're at least trying to use the exact same optimization pipeline as C++ in clang. Our codegen I know historically in at least a few places has had to be altered to look more like C++ to get better optimized, but that sort of issue comes up relatively rarely.

the8472 commented 6 years ago

It's of course possible to run instcombine + simplifycfg in a loop n times while inlining, and that would presumably make the code faster, at some expense in compile time (in the std::vector<>::push_back example in the talk, that's quantifiable by the number of entries that can be simplified), but in general it should be a good win for some cases.

Would it make sense to apply iterate-to-fixpoint optimization passes only to a subset of methods — basically hotspots found by profiling? Either via an annotation similar to #[inline] or as part of PGO.

hanna-kruppe commented 6 years ago

@the8472 Most combinations of passes (including pretty basic combinations like SimplifyCFG+InstCombine, as @dcci mentioned in the post you quoted) don't generally converge to a fixed point when run together, i.e., on some code iterate-to-fixpoint would loop infinitely.

the8472 commented 6 years ago

@rkruppe ok, if it does not reach a fixpoint, does it reach fairly small cycles? In that case a cycle finding algorithm using checkpoints could be used to stop the iteration. Additionally the idea would be to run this in combination with inlining (@dcci's push_back example), so a code size constraint could also be added as abort criteria.

hanna-kruppe commented 6 years ago

That's a big if (and getting more unlikely as you add more passes), checkpoints will be tricky to implement and balloon compile time & peak rss even further, and the approach doesn't scale to larger pipelines.

I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point, only by adopting a wholly different approach to optimization (e.g., equality saturation).

eternaleye commented 6 years ago

@rkruppe: It's pretty easy to show that iterating to a fixpoint is insufficient by showing that there are pairs of optimizations for which no ordering is optimal - (register allocation, instruction selection) is something of the "canonical" such pair, with the solution being to fuse the two into a single pass.

Alan Lawrence's thesis on optimizing using the Value-State Dependence Graph (VSDG) representation covers that pretty nicely, along with a variety of related issues.

In addition, because the VSDG eliminates control flow and non-causal ordering from the IR (then reintroduces it optimally during proceduralization and sequentialization), equality saturation performed on the VSDG may be more efficiently implementable (as those are entire classes of equivalences it won't need to analyze or represent).

the8472 commented 6 years ago

@rkruppe yes, the compile times are an obvious concern; that's why I suggested only applying it to select functions, not globally. E.g. this comment on PGO mentions a bin crate having one hot code path they'd like to optimize.

I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point

But one could still get an improvement over the current situation.

main-- commented 6 years ago

So if I read this correctly, is this issue the reason why code using Ord::cmp currently compiles to an instruction sequence that first compares the values, conditionally moves -1/0/1 into a register and then compares this again?

andjo403 commented 6 years ago

shall this be marked solved now? the generated code with rustc 1.25.0-nightly (16362c737 2018-02-12):

define zeroext i1 @example::exists_in_table(i32 %v) unnamed_addr #0 !dbg !12 {
  %0 = icmp eq i32 %v, 0, !dbg !14
  ret i1 %0, !dbg !18
}

see https://godbolt.org/g/B9kkeV for diff. solved by #47828

main-- commented 6 years ago

Perhaps this is a different issue, but I'm still seeing std::cmp::Ordering values being materialized to -1/0/1 for no reason: https://godbolt.org/g/mbQDZo

pub fn mytest(x: i32, y: i32, i: usize) -> usize {
    let x = if x == y { ::std::process::exit(0) }
    else if x < y { 0 }
    else { 1 };

    2 * (i + 1) - x
}

Comparing manually is fine and yields perfect code:

example::mytest:
  cmp edi, esi
  je .LBB0_1
  setge al
  add rdx, rdx
  movzx eax, al
  sub rdx, rax
  add rdx, 2
  mov rax, rdx
  ret
.LBB0_1:
  push rbp
  mov rbp, rsp
  xor edi, edi
  call std::process::exit@PLT
  ud2

Using Ord (a zero-cost abstraction, one would think):

use std::cmp::Ordering::*;
pub fn mytest(x: i32, y: i32, i: usize) -> usize {
    let x = match x.cmp(&y) {
        Equal => ::std::process::exit(0),
        Less => 0,
        Greater => 1,
    };

    2 * (i + 1) - x
}

The Ordering value is first materialized into rcx for no reason, then the code branches on that:

example::mytest:
  xor eax, eax
  xor ecx, ecx
  cmp edi, esi
  setge cl
  lea rcx, [rcx + rcx - 1]
  cmove rcx, rax
  cmp rcx, 1
  je .LBB0_3
  test rcx, rcx
  je .LBB0_2
  lea rax, [rax + 2*rdx]
  add rax, 2
  ret
.LBB0_3:
  mov rax, -1
  lea rax, [rax + 2*rdx]
  add rax, 2
  ret
.LBB0_2:
  push rbp
  mov rbp, rsp
  xor edi, edi
  call std::process::exit@PLT
  ud2

This is especially problematic when implementing a generic data structure which has to use Ord and can't simply fall back to an if-else chain (like https://github.com/main--/rust-eytzinger/blob/ea266a38e4c5e72aa0ab934a5bfef8e8fff3bcd8/src/lib.rs#L329-L339).

nagisa commented 6 years ago

This issue is hardly actionable. We should make a tracking issue (possibly converting this to a tracking issue) and associate specific issues with the tracking issue.

I hope that answers the questions posed in the two comments above.

the8472 commented 3 years ago

Would this be the right place or has a separate issue been created regarding pass optimizations?