Open eddyb opened 7 years ago
I'm very new to llvm and compilers but I was trying to see what the problem was with this so I was running:
rustc -Cllvm-args='-print-after-all' -O --emit llvm-ir badopt.rs |& rustfilt
and get a part of the log that looks like this:
*** IR Dump After Loop-Closed SSA Form Pass ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
br label %bb3
bb3: ; preds = %bb6, %start
%iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ]
%1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0)
br i1 %1, label %bb9, label %bb6
bb6: ; preds = %bb3
%2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1
%3 = load i32, i32* %iter.sroa.0.0, align 4
%4 = icmp eq i32 %3, %0
br i1 %4, label %bb9, label %bb3
bb9: ; preds = %bb3, %bb6
%_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ]
ret i1 %_0.0
}
*** IR Dump After Combine redundant instructions ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
br label %bb3
bb3: ; preds = %start
br i1 false, label %bb9, label %bb6
bb6: ; preds = %bb3
%1 = icmp eq i32 %0, 0
br i1 %1, label %bb9, label %bb3.1
bb9: ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
%_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
ret i1 %_0.0
bb3.1: ; preds = %bb6
br i1 false, label %bb9, label %bb6.1
bb6.1: ; preds = %bb3.1
%2 = icmp eq i32 %0, 0
br i1 %2, label %bb9, label %bb3.2
bb3.2: ; preds = %bb6.1
br i1 false, label %bb9, label %bb6.2
bb6.2: ; preds = %bb3.2
%3 = icmp eq i32 %0, 0
br i1 %3, label %bb9, label %bb3.3
bb3.3: ; preds = %bb6.2
br i1 false, label %bb9, label %bb6.3
bb6.3: ; preds = %bb3.3
%4 = icmp eq i32 %0, 0
br i1 %4, label %bb9, label %bb3.4
bb3.4: ; preds = %bb6.3
br i1 true, label %bb9, label %bb6.4
bb6.4: ; preds = %bb3.4
br label %bb9
}
after the Combine redundant instructions (instcombine) pass there is no more Simplify the CFG (simplifycfg). To me it looks like instcombine changed the CFG. But the instcombine pass description states that "This pass does not modify the CFG." Is it possible that this is the problem?
@andjo403 Oh wow, that is very useful, thanks! I thought I was wrong in assuming that a very complex interaction between passes causes this result. Those are consecutive passes, right?
I don't understand how there are more basic blocks after instcombine. Looking more at it, the .1, .2, .3 and .4 suffixes suggest loop unrolling, but instcombine shouldn't do that AFAIK.
I added -print-before-all also and then there is an Unroll loops between the passes from the log before so instcombine is not the problem. new log:
*** IR Dump Before Unroll loops ***
bb3: ; preds = %bb6, %start
%iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ]
%1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0)
br i1 %1, label %bb9, label %bb6
bb6: ; preds = %bb3
%2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1
%3 = load i32, i32* %iter.sroa.0.0, align 4
%4 = icmp eq i32 %3, %0
br i1 %4, label %bb9, label %bb3
*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind uwtable
define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 {
start:
br label %bb3
bb3: ; preds = %start
br i1 false, label %bb9, label %bb6
bb6: ; preds = %bb3
%1 = icmp eq i32 0, %0
br i1 %1, label %bb9, label %bb3.1
bb9: ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
%_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
ret i1 %_0.0
bb3.1: ; preds = %bb6
br i1 false, label %bb9, label %bb6.1
bb6.1: ; preds = %bb3.1
%2 = icmp eq i32 0, %0
br i1 %2, label %bb9, label %bb3.2
bb3.2: ; preds = %bb6.1
br i1 false, label %bb9, label %bb6.2
bb6.2: ; preds = %bb3.2
%3 = icmp eq i32 0, %0
br i1 %3, label %bb9, label %bb3.3
bb3.3: ; preds = %bb6.2
br i1 false, label %bb9, label %bb6.3
bb6.3: ; preds = %bb3.3
%4 = icmp eq i32 0, %0
br i1 %4, label %bb9, label %bb3.4
bb3.4: ; preds = %bb6.3
br i1 true, label %bb9, label %bb6.4
bb6.4: ; preds = %bb3.4
br label %bb9
}
Apparently the Unroll loops pass does not produce an IR dump after the pass — strange.
Okay, so the question is why would loop unrolling not have a CFG simplification pass after it?
This seems like the right combination of passes: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L782-L790
Also good here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L620-L625
This on the other hand looks bad: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L374
And this is probably the one that unrolls the loop here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L629-L632
So @andjo403 and I spoke about this at lunch the other day, and I took some time today to just add a CFG simplification pass at the places you outlined above to see what happened.
Turns out, adding it on line 633 after the unroll/instruction simplification fixes the issue. The resulting IR for the function reduces to:
; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 {
start:
%1 = icmp eq i32 %0, 0
%. = select i1 %1, i1 true, i1 false
ret i1 %.
}
which is what we want, I believe.
I suspect that the unroll on line 374 may in fact not be a problem thanks to the CFG simplification pass that follows it on line 382. (Unless the LoadCombine/DCE passes that run in between cause changes that negatively affect SimplifyCFG. I know nothing about LLVM internals, but my hunch is that it's unlikely.)
That's great!
I can't help but notice, though, that the IR you showed could be simplified further by instcombine. Maybe it would make sense to add the simplifyCFG before the instcombine, or add another instcombine afterwards?
I'll run a couple of builds trying that out.
Nice, simply moving SimplifyCFG up before InstCombine gives this:
; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 {
start:
%1 = icmp eq i32 %0, 0
ret i1 %1
}
Adding another InstCombine after the SimplifyCFG from my previous attempt gives the same output.
Not sure if there are situations where the longer incantation of Unroll>InstCombine>SimplifyCFG>InstCombine would be preferable to just Unroll>SimplifyCFG>InstCombine.
Yeah, I'm unsure as well. Running more passes for no good reason should be avoided, but InstCombine can also unlock opportunities for CFG simplification (by turning branch conditions into constants), so in general there is no single right order. In fact, some of the PMB pieces @eddyb linked above run InstCombine>SimplifyCFG and others run SimplifyCFG>InstCombine (though none run IC>SimplifyCFG>IC, IIRC). Maybe LLVM developers have an idea? (Has someone already filed an LLVM issue for this?)
I was doing some legwork whilst writing an LLVM bug for this, and noticed that on LLVM trunk, they've added a SimplifyCFG pass at the very end of the function which I modified above (populateModulePassManager).
It was added in this change: https://reviews.llvm.org/rL301395
I made the corresponding change in the Rust tree, and the end result is the second to last LLVM-IR above, with the icmp and select. So not fully optimal but at least much better. I guess Rust would get this change once LLVM 5.0 is in the tree.
Is there still a point to filing an LLVM bug/request for comment on this? Supposedly the suggestion would then probably be to add another InstCombine after the last SimplifyCFG, which should result in the optimal case. If yes I'll try to get one posted tomorrow.
We could bring up whether it might make sense to run SimplifyCFG before instcombine, but tbh it's probably not worth the trouble. @eddyb has had some thoughts about more principled ways to ensure certain simplifications are reliably applied. Such solutions would help all programs, rather than plugging one hole at a time.
I also heard sentiments from LLVM developers that instcombine is already run a bit too often for their tastes, so "pls spam this pass some more so my silly test program is marginally faster" may be dismissed on those grounds as well.
Hi, I'm one of the developers who thinks Instcombine went a little too far :) FWIW, here's my thought (I wrote on the llvm-dev mailing list, but let me copy here in case)
SimplifyCFG is the correct pass for this kind of transformation. I
don't think it's entirely unreasonable to run it more often, but the
compile time cost needs to be taken in account.
I'm not necessarily sympathetic to the idea of adding another canonicalization
pass only for this purpose.
Side note:
IMHO, at this point SimplifyCFG is doing even more than it should
(including, e.g. sinking of variable from "almost empty" BBs, i.e. BBs
which have only a single instruction & terminator, if I recall
correctly). If anything, I'd rather split the sinking logic and other operations
to a different pass, rather than moving simplifying proven unconditional
branches elsewhere :)
Now that LLVM has an incremental dominator API it should be more
feasible to run SimplifyCFG more times without recomputing the
dominator all the time. I haven't checked whether there are other
expensive analyses that need to be preserved
Also, FWIW, to clear other people's doubt. SimplifyCFG + Instcombine when run in tandem don't reach a maximal fixpoint. When running the inliner with function simplification passes pipelines, this is fairly visible (see, e.g. Chandler's talk https://www.youtube.com/watch?v=s4wnuiCwTGU for a more detailed explanation of the problem).
It's of course possible to run instcombine + simplifycfg in a loop n times while inlining, and that would presumably make the code faster, at some expense for compile time (in the std::vector<>::push_back
example in the talk, that's quantifiable by the number of entries that can be simplified, but in general it should be a good win for some cases).
I guess it's entirely up to the user/downstream language to decide whether once the new pass manager will be in place it's profitable to run these passes in a loop. I think there are plans to implement this feature, but it's unlikely it will be the default (presumably, it will be guarded by a cl::opt
or something like that)
@dcci
Looking at performance stats again, I don't think adding 2 more calls to SimplifyCfg
(one between IndVarSimplify/LoopIdiomRecognize - which I think is fairly important, because people tend to get annoyed about LoopIdiomRecognize not working well - and one right after loop unrolling) would be too much of a problem.
@arielb1 Fair enough. I think it might be a good addition for C++/clang as well, but I haven't gotten the time to check. Some people (TM) are trying very hard upstream to make a mess of SimplifyCFG, but there's hope they'll fail, so I think your plan is reasonable. Can you add performance/compile time numbers after your experiments? Thanks!
cc: @alexcrichton
Also, I haven't looked at the IR produced by Rust very closely but I'd like to point out two things: 1) The current pass manager pipeline (both module and function) has been heavily optimized for C/C++. If Rust wants to propose an alternative, well, maybe we (LLVM) should take a look 2) There have been many discussions about the pipeline getting older and older (and therefore, less tuned). During the ThinLTO bringup people tuned it a bit, but the Module and the LTO pipeline are not necessarily the best (e.g. we run loop vectorizer and loop unroller both in the per-TU pipeline and in the LTO pipeline). In other words, if you're interested, this is something that needs some love.
@dcci heh, I'd definitely believe there are some possible improvements here! AFAIK we're using the "vanilla" pass manager builders in LLVM, in the sense that we're at least trying to use the exact same optimization pipeline as C++ in clang. Our codegen, I know, has historically in at least a few places had to be altered to look more like C++ to get better optimized, but that sort of issue comes up relatively rarely.
It's of course possible to run instcombine + simplifycfg in a loop n times while inlining and that would presumably make the code faster, at some expense for compile time (in the std::vector<>::push_back example in the talk, that's quantifiable by the number of entries that can be simplified, but in general it should be a good win for some cases).
Would it make sense to apply iterate-to-fixpoint optimization passes on a subset of methods, hotspots found by profiling basically. Either via annotation similar to #[inline]
or as part of PGO.
@the8472 Most combinations of passes (including pretty basic combinations like SimplifyCFG+InstCombine, as @dcci mentioned in the post you quoted) don't generally converge to a fixed point when run together, i.e., on some code iterate-to-fixpoint would loop infinitely.
@rkruppe ok, if it does not reach a fixpoint, does it reach fairly small cycles? In that case a cycle finding algorithm using checkpoints could be used to stop the iteration. Additionally the idea would be to run this in combination with inlining (@dcci's push_back example), so a code size constraint could also be added as abort criteria.
That's a big if (and getting more unlikely as you add more passes), checkpoints will be tricky to implement and balloon compile time & peak rss even further, and the approach doesn't scale to larger pipelines.
I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point, only by adopting a wholly different approach to optimization (e.g., equality saturation).
@rkruppe: It's pretty easy to show that iterating to a fixpoint is insufficient by showing that there are pairs of optimizations for which no ordering is optimal - (register allocation, instruction selection) is something of the "canonical" such pair, with the solution being to fuse the two into a single pass.
Alan Lawrence's thesis on optimizing using the Value-State Dependence Graph (VSDG) representation covers that pretty nicely, along with a variety of related issues.
In addition, because the VSDG eliminates control flow and non-causal ordering from the IR (then reintroduces it optimally during proceduralization and sequentialization), equality saturation performed on the VSDG may be more efficiently implementable (as those are entire classes of equivalences it won't need to analyze or represent).
@rkruppe yes, the compile times are an obvious concern; that's why I suggested only applying it to select functions, not globally. E.g. this comment on PGO mentions a bin crate having one hot code path they'd like to optimize.
I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point
But one could still get an improvement over the current situation.
So if I read this correctly, is this issue the reason why code using Ord::cmp
currently compiles to an instruction sequence that first compares the values, conditionally moves -1/0/1 into a register and then compares this again?
shall this be marked solved now? the generated code with rustc 1.25.0-nightly (16362c737 2018-02-12):
define zeroext i1 @example::exists_in_table(i32 %v) unnamed_addr #0 !dbg !12 {
%0 = icmp eq i32 %v, 0, !dbg !14
ret i1 %0, !dbg !18
}
see https://godbolt.org/g/B9kkeV for diff. solved by #47828
Perhaps this is a different issue, but I'm still seeing std::cmp::Ordering
values being materialized to -1/0/1 for no reason: https://godbolt.org/g/mbQDZo
pub fn mytest(x: i32, y: i32, i: usize) -> usize {
let x = if x == y { ::std::process::exit(0) }
else if x < y { 0 }
else { 1 };
2 * (i + 1) - x
}
Comparing manually is fine and yields perfect code:
example::mytest:
cmp edi, esi
je .LBB0_1
setge al
add rdx, rdx
movzx eax, al
sub rdx, rax
add rdx, 2
mov rax, rdx
ret
.LBB0_1:
push rbp
mov rbp, rsp
xor edi, edi
call std::process::exit@PLT
ud2
Using Ord
(a zero-cost abstraction, one would think):
use std::cmp::Ordering::*;
pub fn mytest(x: i32, y: i32, i: usize) -> usize {
let x = match x.cmp(&y) {
Equal => ::std::process::exit(0),
Less => 0,
Greater => 1,
};
2 * (i + 1) - x
}
The Ordering
value is first materialized into rcx
for no reason, then the code branches on that:
example::mytest:
xor eax, eax
xor ecx, ecx
cmp edi, esi
setge cl
lea rcx, [rcx + rcx - 1]
cmove rcx, rax
cmp rcx, 1
je .LBB0_3
test rcx, rcx
je .LBB0_2
lea rax, [rax + 2*rdx]
add rax, 2
ret
.LBB0_3:
mov rax, -1
lea rax, [rax + 2*rdx]
add rax, 2
ret
.LBB0_2:
push rbp
mov rbp, rsp
xor edi, edi
call std::process::exit@PLT
ud2
This is especially problematic when implementing a generic data structure which has to use Ord
and can't simply fall back to an if-else chain (like https://github.com/main--/rust-eytzinger/blob/ea266a38e4c5e72aa0ab934a5bfef8e8fff3bcd8/src/lib.rs#L329-L339).
This issue is hardly actionable. We should make a tracking issue (possibly converting this to a tracking issue) and associate specific issues with the tracking issue.
I hope that answers the questions posed in the two comments above.
Would this be the right place or has a separate issue been created regarding pass optimizations?
The latest example of this I've seen was posted by @Veedrac to Hacker News (try on playground):
After
-C opt-level=3
the ~600 lines of unoptimized IR turn into this (notebr i1 false
):