Closed: serguei-katkov closed this issue 3 years ago.
Per the above comments, this issue was fixed a while ago.
Hi Brian, Eli's patch fixes the original issue.
Hi Serguei, can you please verify if Eli's patch in D63913 removes the issue on your original IR?
Posted https://reviews.llvm.org/D63913 .
I have posted an updated patch on https://reviews.llvm.org/D63629 . Craig, when you have the time, I'd appreciate if you could review and provide feedback.
Thank you for confirming Serguei. I'm running tests and final cleanup to the patch and will post for review in the community today if all goes well with testing.
Hi Brian, I confirmed that your last patch fixes my original case.
Hello Serguei, I have a potential patch that fixes the new test case you posted. Before I update the Phabricator diff I'd appreciate if you can verify the problem is fixed on your original source input. The patch eliminates constant edges in the function F before calculating loop headers. This prevents false loop headers from being identified and (worse) the true loop headers being missed.
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index 0464d40c45e..20c02d701d8 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -107,7 +107,7 @@ public: BPI.reset(); }
void FindLoopHeaders(Function &F);
bool FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs, BasicBlock *SuccBB); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index c80cdd08224..19ac1d3f8ad 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -363,14 +363,12 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, SmallPtrSet<BasicBlock *, 16> Unreachable; assert(DTU && "DTU isn't passed into JumpThreading before using it."); assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
bool EverChanged = FindLoopHeaders(F); DominatorTree &DT = DTU->getDomTree(); for (auto &BB : F) if (!DT.isReachableFromEntry(&BB)) Unreachable.insert(&BB);
FindLoopHeaders(F);
bool EverChanged = false; bool Changed; do { Changed = false; @@ -540,12 +538,27 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, /// within the loop (forming a nested loop). This simple analysis is not rich /// enough to track all of these properties and keep it up-to-date as the CFG /// mutates, so we don't allow any of these transformations. -void JumpThreadingPass::FindLoopHeaders(Function &F) { +bool JumpThreadingPass::FindLoopHeaders(Function &F) {
bool Changed = false;
// BMR ADD REASON HERE.
for (auto &BB : F) {
if (ConstantFoldTerminator(&BB, true, TLI, DTU)) {
++NumFolds;
Changed = true;
LLVM_DEBUG(dbgs() << " FindLoopHeaders, in block '" << BB.getName()
<< "' folded to new terminator: " << *BB.getTerminator()
<< '\n');
}
}
SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges; FindFunctionBackedges(F, Edges);
for (const auto &Edge : Edges) LoopHeaders.insert(Edge.second);
return Changed; }
/// getKnownConstant - Helper method to determine if we can thread over a @@ -1100,11 +1113,16 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // other blocks. if (getKnownConstant(Condition, Preference)) { LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding terminator: " << *BB->getTerminator()
<< "' trying to fold terminator: " << *BB->getTerminator() << '\n');
++NumFolds;
ConstantFoldTerminator(BB, true, nullptr, DTU);
return true;
if (ConstantFoldTerminator(BB, true, TLI, DTU)) {
++NumFolds;
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folded to new terminator: "
<< *BB->getTerminator() << '\n');
return true;
}
return false; }
Instruction *CondInst = dyn_cast<Instruction>(Condition);
Hi Serguei, I've reduced your new case and renamed a few things to make it easier to trace:
define i32 @test(i1 %ARG1, i1 %ARG2) { entry: br label %pre_head
pre_head: %tmp = phi i32 [ 0, %entry ], [ %tmp10, %body1 ] %tmp3 = phi i32 [ 0, %entry ], [ %tmp11, %body1 ] %tmp4 = phi i32 [ 0, %entry ], [ %tmp11, %body1 ] br label %head1
head1: br i1 %ARG1, label %exit, label %head2
head2: br i1 false, label %head3, label %body2
head3: %tmp8 = add i32 %tmp3, 1 br label %body1
body1: %tmp10 = phi i32 [ %tmp, %head3 ], [ %tmp16, %latch2 ] %tmp11 = phi i32 [ %tmp8, %head3 ], [ %tmp16, %latch2 ] %tmp12 = icmp sgt i32 %tmp11, 1 br i1 %tmp12, label %body2, label %pre_head
body2: %tmp14 = phi i32 [ %tmp10, %body1 ], [ %tmp, %head2 ] %tmp15 = phi i32 [ %tmp11, %body1 ], [ %tmp3, %head2 ] %tmp16 = add i32 %tmp14, 1 br i1 %ARG2, label %exit, label %latch1
latch1: %tmp18 = icmp sgt i32 %tmp16, 2 br i1 %tmp18, label %exit, label %latch2
latch2: br label %body1
exit: %rc = phi i32 [ %tmp15, %body2 ], [ %tmp15, %latch1 ], [ -1, %head1 ] ret i32 %rc }
This is actually the same problem at its core. The FindFunctionBackedges() routine is unable to detect the true loop headers because of fake edges that don't actually exist in the IR. If I make the following patch:
--- ../pr42085.ll 2019-06-21 12:44:37.059215250 -0500 +++ pr42085.ll 2019-06-21 12:44:48.767165622 -0500 @@ -12,7 +12,7 @@ br i1 %ARG1, label %exit, label %head2
head2:
br label %body2
head3: %tmp8 = add i32 %tmp3, 1
The problem goes away again. This allows LoopHeaders to be correctly identified. I think the only way to fix this is to fully normalize loops before jumpthreading, or to attempt to scrub the IR a bit before calculating loopheaders. I'll have to examine my options here.
One idea I have is to remove all constant branches before calculating loopheaders, but I'm not sure what problems this may cause early on in the pass.
Serguei, thank you for reducing with patch 1; that is what I was going to suggest. https://reviews.llvm.org/D63629 is still a valid issue and I will probably have better luck in the community pushing that patch than a complete rebuild of LoopHeaders within the pass.
; ModuleID = 'test.ll' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" target triple = "x86_64-unknown-linux-gnu"
define i32 @test(i1 %arg, i1 %arg1) { bb: br label %bb2
bb2: ; preds = %bb9, %bb %tmp = phi i32 [ 0, %bb ], [ %tmp10, %bb9 ] %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb9 ] %tmp4 = phi i32 [ 0, %bb ], [ %tmp11, %bb9 ] br label %bb5
bb5: ; preds = %bb2 br i1 %arg, label %bb21, label %bb6
bb6: ; preds = %bb5 br i1 false, label %bb7, label %bb13
bb7: ; preds = %bb6 %tmp8 = add i32 %tmp3, 1 br label %bb9
bb9: ; preds = %bb20, %bb7 %tmp10 = phi i32 [ %tmp, %bb7 ], [ %tmp16, %bb20 ] %tmp11 = phi i32 [ %tmp8, %bb7 ], [ %tmp16, %bb20 ] %tmp12 = icmp sgt i32 %tmp11, 72 br i1 %tmp12, label %bb13, label %bb2
bb13: ; preds = %bb9, %bb6 %tmp14 = phi i32 [ %tmp10, %bb9 ], [ %tmp, %bb6 ] %tmp15 = phi i32 [ %tmp11, %bb9 ], [ %tmp3, %bb6 ] %tmp16 = add i32 %tmp14, 1 br i1 %arg1, label %bb22, label %bb17
bb17: ; preds = %bb13 %tmp18 = icmp sgt i32 %tmp16, 91 br i1 %tmp18, label %bb19, label %bb20
bb19: ; preds = %bb17 ret i32 %tmp15
bb20: ; preds = %bb17 br label %bb9
bb21: ; preds = %bb5 ret i32 -1
bb22: ; preds = %bb13 ret i32 -1 }
Unfortunately, the second patch does not fix the problem either.
Do you prefer the new reproducer for the first patch or the second? I'm not sure whether it will be the same or not.
Hello Serguei, I have an alternate, more aggressive patch that forces a rebuild of the LoopHeaders if we encounter the constant-fold condition. I didn't post it originally because I consider it more expensive, but I would be interested to know if it fixes your original case.
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 7cb955d03ff..7a97c59c874 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1100,11 +1100,31 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // other blocks. if (getKnownConstant(Condition, Preference)) { LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
return TIFolded; }
Instruction *CondInst = dyn_cast<Instruction>(Condition);
If that doesn't work we will need a more comprehensive reduced testcase.
Hi Brian, unfortunately the uploaded potential fix does not help the original reproducer from which I reduced the test case.
I'll double-check and try to reduce it again with your fix applied to narrow down the next problem... I will keep you informed.
Posted potential fix https://reviews.llvm.org/D63629 for review.
I have reduced down the problem to what I believe is the minimal IR shape:
define i32 @test(i1 %ARG) { entry: br label %pre_body
head: %inc_head = phi i32 [ %inc_pre_body, %pre_body ], [ %inc, %latch ] %cmp1 = icmp sgt i32 %inc_head, 1 br i1 %cmp1, label %body, label %pre_body
pre_body: %inc_pre_body = phi i32 [ 1, %entry ], [ %inc_head, %head ] br i1 false, label %head, label %body
body: %res = phi i32 [ %inc_head, %head ], [ %inc_pre_body, %pre_body ] %tmp = phi i32 [ %inc_head, %head ], [ %inc_pre_body, %pre_body ] %inc = add i32 %tmp, 1 br i1 %ARG, label %exit, label %latch
latch: %cmp2 = icmp sgt i32 %inc, 2 br i1 %cmp2, label %exit, label %head
exit: %rc = phi i32 [ %res, %latch ], [ 0, %body ] ret i32 %rc }
JumpThreading threads: head -> body -> latch to: head -> body.thread -> latch
The reason this is a miscompile is body is actually a loop header, but the existing analysis in JumpThreading does not recognize it as one.
There are three issues that cause the miscompile to happen. If any one of these are altered the problem does not arise:
Before any of the block iteration, JT calls JumpThreading::FindLoopHeaders(), which calls FindFunctionBackedges(F, Edges). The latter function is in CFG.cpp and is meant to be a very simple and fast way of finding loop backedges without LI or DT. It does so by examining BBs only as opaque objects with incoming and outgoing edges. This causes it to incorrectly identify the LoopHeaders of this IR, due to #1 and #2 above.
In the failing case FindFunctionBackedges() identifies the following two Edge pairs, and JT stores the second value of each pair as a LoopHeader: (latch, head) (head, pre_body)
However, if we make the following single change to the IR before JT is called:
$ diff -u pr42085.ll pr42085_good.ll --- pr42085.ll 2019-06-19 14:40:44.548986379 -0500 +++ pr42085_good.ll 2019-06-19 14:40:03.657152878 -0500 @@ -3,13 +3,13 @@ br label %pre_body
head:
%inc_head = phi i32 [ %inc, %latch ] %cmp1 = icmp sgt i32 %inc_head, 1 br i1 %cmp1, label %body, label %pre_body
pre_body: %inc_pre_body = phi i32 [ 1, %entry ], [ %inc_head, %head ]
br label %body
body: %res = phi i32 [ %inc_head, %head ], [ %inc_pre_body, %pre_body ]
Then FindFunctionBackedges() identifies the following two Edge pairs: (head, body) (head, pre_body)
This is significant because the second check of JumpThreading::ThreadEdge() is the following:
// If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { LLVM_DEBUG({ bool BBIsHeader = LoopHeaders.count(BB); bool SuccIsHeader = LoopHeaders.count(SuccBB); dbgs() << " Not threading across " << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '") << SuccBB->getName() << "' - it might create an irreducible loop!\n"; }); return false; }
This check causes BB (which is 'body') to be discarded as a thread candidate.
Finally, #3 above contributes to the problem because the condition is constant only in a block inside the loop nest, causing JumpThreading to identify threading opportunities on blocks that may be backedges. A different optimization pass, like SimplifyCFG, correctly identifies the condition as constant across the entire function and hoists it to the beginning, which is far more profitable than threading across it later.
I may have a fix, but I am unsure if the condition holds in all cases and need to test it. The idea in pseudo-code:
if bb_has_conditional and bb_conditional_is_convertable_to_unconditional:
    bb_convert_conditional_to_unconditional()
    if bb_is_loop_header:
        bb_remove_from_loop_headers()
        succbb = bb.succ()
        succbb_add_to_loop_headers()
I need to identify, and convert, all locations where this happens in JumpThreading.
Thank you for the explanation.
Hi Serguei, Let me first be clear:
--> opt must never emit miscompiled code <--
With that said, we have two big problems in JumpThreading today regarding loops. The first is we do not have LoopInfo (LI) available to us in the pass. The second problem, a much more difficult one, is that JumpThreading was never designed to preserve loops and loop hierarchies. The code in ProcessBlock() alters the blocks in F very aggressively, in the hopes of finding the ideal case of PredBB -> BB -> DestBB to thread. Many of these early changes to the IR will have to be changed to properly preserve the LoopInfo during the lifetime of the JumpThreading transform pass. This is what I was referring to in my previous post as a big undertaking. I actually consider this more of a feature enhancement than a bugfix.
Right now I'm looking to see if it's possible to pessimize this thread in some way with the existing analysis without pessimizing other, valid, threads. If this looks to be unlikely I may have to use an assert().
The decision of the final right answer is up to the LLVM community as a whole. This may take time to resolve and if you need a workaround during this time I recommend avoiding the generation of non-natural loop structures. This is probably one of the weakest areas for the existing JumpThreading pass.
I'm sorry, but I tend to disagree here. LLVM is not only for Clang, and any pass is expected to work correctly whenever its input is valid.
Adding guards/asserts is the right way to protect uncovered cases in a pass.
I've re-written the new v2 test-case IR with names that should be more helpful:
define i32 @test(i1 %ARG) { entry: br label %h1
h1: %phi_h1 = phi i32 [ 0, %entry ], [ %phi_h2, %h2 ] br i1 false, label %h2, label %body
h2: %phi_h2 = phi i32 [ %phi_h1, %h1 ], [ %inc, %latch ] %cmp_h2 = icmp sgt i32 %phi_h2, 4 br i1 %cmp_h2, label %body, label %h1
body: %res = phi i32 [ %phi_h2, %h2 ], [ %phi_h1, %h1 ] %phi_body = phi i32 [ %phi_h2, %h2 ], [ %phi_h1, %h1 ] %inc = add i32 %phi_body, 1 br i1 %ARG, label %body_exit, label %latch
body_exit: ret i32 -1
latch: %cmp_latch = icmp sgt i32 %inc, 5 br i1 %cmp_latch, label %latch_exit, label %h2
latch_exit: ret i32 %res }
And when running it through the go.sh test framework I see the following:
$ ./go.sh using clang='/work/b.rzycki/upstream/install/bin/clang' using llc='/work/b.rzycki/upstream/install/bin/llc' using opt='/work/b.rzycki/upstream/install/bin/opt'
[ INPUT ==> arg=0 ] [runme_pr ] test == 5 [runme_jt ] test == 6
I now understand why this code is producing different values before and after JumpThreading. To help visualize I've attached an image of the IR graph.
The core of the problem is two non-natural loops jumping between each other. The first loop is between h1 and h2. The second one is between h2, body, and latch. The block h2 lives in both loops. A second problem is the unusual terminator of body, which exits the function entirely.
In the case of the incorrectly threaded edge we start off in ProcessBlock() examining BB == body, and eventually enter ProcessThreadableEdges(). For the potential path h2 -> body -> latch we examine the terminating compare in body. This is a non-natural comparison against the argument ARG instead of a check of %inc, the usual loop-continuation test; that check actually lives in latch's terminator. We then call ThreadEdge().
The current set of loop analysis checks are insufficient for this kind of IR. The only guards JumpThreading uses is basic checks for LoopHeaders. JT has no concept of Loop nest levels or non-natural loops shapes.
I'm unsure if it makes sense to add defensive programming to JT for this kind of IR, given that Clang passes in IR of a more standard shape. It would only really benefit those who use optimization passes in complete isolation. My concern is that the added checks would increase code complexity and compile time for a fairly uncommon issue.
For example, this compilation problem goes away if we simplify the CFG before calling JumpThreading:
b.rzycki@cc02 /work/b.rzycki/pr42085v2 $ grep opt go.sh opt="$cbase/opt" echo "change variable 'cbase' to the location of clang, opt, llc" echo "using opt='$opt'" "$opt" -passes=simplify-cfg,jump-threading -S -o jt.ll "$pr" b.rzycki@cc02 /work/b.rzycki/pr42085v2 $ ./go.sh using clang='/work/b.rzycki/upstream/install/bin/clang' using llc='/work/b.rzycki/upstream/install/bin/llc' using opt='/work/b.rzycki/upstream/install/bin/opt'
[ INPUT ==> arg=0 ] [runme_pr ] test == 5 [runme_jt ] test == 5
Hi Serguei, thank you for verifying and rebuilding your test case. No worries about the mistake, this stuff is tricky. :)
I have verified your results with a new framework which I will attach to this ticket. Here's the output:
b.rzycki@cc02 ~/pr42085v2 $ ./go.sh clean b.rzycki@cc02 ~/pr42085v2 $ ./go.sh using clang='/work/b.rzycki/upstream/install/bin/clang' using llc='/work/b.rzycki/upstream/install/bin/llc' using opt='/work/b.rzycki/upstream/install/bin/opt'
[ INPUT ==> arg=0 ] [runme_pr ] test == 91 [runme_jt ] test == 92
[ INPUT ==> arg=1 ] [runme_pr ] test == -1 [runme_jt ] test == -1
I'll take a look at the threaded case in GDB to see why this happened.
And again, sorry for the wrong initial reproducer :(
OK, your harness turned out to be simple to modify and use (thanks for this).
After some modifications related to argument passing I got: test == 91 test == 92 where the first line is for the good IR and the second one is for the IR processed with jump-threading.
I believe this repro should show the bug.
I did not check it through your harness yet. Will do it tomorrow.
; ModuleID = 'test.ll' source_filename = "test.ll" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" target triple = "x86_64-unknown-linux-gnu"
define i32 @test(i1 %arg) { bb: br label %bb1
bb1: ; preds = %bb3, %bb %tmp = phi i32 [ 0, %bb ], [ %tmp4, %bb3 ] br label %bb2
bb2: ; preds = %bb1 br i1 false, label %bb3, label %bb6
bb3: ; preds = %bb13, %bb2 %tmp4 = phi i32 [ %tmp, %bb2 ], [ %tmp9, %bb13 ] %tmp5 = icmp sgt i32 %tmp4, 72 br i1 %tmp5, label %bb6, label %bb1
bb6: ; preds = %bb3, %bb2 %tmp7 = phi i32 [ %tmp4, %bb3 ], [ %tmp, %bb2 ] %tmp8 = phi i32 [ %tmp4, %bb3 ], [ %tmp, %bb2 ] %tmp9 = add i32 %tmp7, 1 br i1 %arg, label %bb14, label %bb10
bb10: ; preds = %bb6 %tmp11 = icmp sgt i32 %tmp9, 91 br i1 %tmp11, label %bb12, label %bb13
bb12: ; preds = %bb10 ret i32 %tmp8
bb13: ; preds = %bb10 br label %bb3
bb14: ; preds = %bb6 ret i32 -1 }
After opt test.ll -S -o test.opt.ll -passes=jump-threading the result is
; ModuleID = 'test.ll' source_filename = "test.ll" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" target triple = "x86_64-unknown-linux-gnu"
define i32 @test(i1 %arg) { bb: br label %bb6
bb3: ; preds = %bb10 %tmp5 = icmp sgt i32 %tmp93, 72 br i1 %tmp5, label %bb6.thread, label %bb6
bb6.thread: ; preds = %bb3 %tmp91 = add i32 %tmp93, 1 br label %bb10
bb6: ; preds = %bb3, %bb %tmp = phi i32 [ 0, %bb ], [ %tmp93, %bb3 ] %tmp9 = add i32 %tmp, 1 br i1 %arg, label %bb14, label %bb10
bb10: ; preds = %bb6.thread, %bb6 %tmp93 = phi i32 [ %tmp91, %bb6.thread ], [ %tmp9, %bb6 ] %tmp82 = phi i32 [ %tmp91, %bb6.thread ], [ %tmp, %bb6 ] %tmp11 = icmp sgt i32 %tmp93, 91 br i1 %tmp11, label %bb12, label %bb3
bb12: ; preds = %bb10 ret i32 %tmp82
bb14: ; preds = %bb6 ret i32 -1 }
Hmmm, I really did something wrong while reducing the original test into a test case... Sorry for that. I'll double-check and fix the test.
Thank you for looking into this...
Serguei, I am having trouble verifying this as a miscompile. I'm attaching a test harness that does the following:
Here is the result of me running it:
b.rzycki@cc ~/pr42085 $ ./go.sh clean b.rzycki@cc ~/pr42085 $ ./go.sh using clang='/work/b.rzycki/upstream/install/bin/clang' using llc='/work/b.rzycki/upstream/install/bin/llc'
[ INPUTS ==> A=0 B=0 C=0 ] [runme_good] test == 0 [runme_bad ] test == 0
[ INPUTS ==> A=100 B=0 C=99 ] [runme_good] test == 0 [runme_bad ] test == 0
[ INPUTS ==> A=100 B=1 C=100 ] [runme_good] test == 170 [runme_bad ] test == 170
[ INPUTS ==> A=102 B=17 C=200 ] [runme_good] test == 170 [runme_bad ] test == 170
[ INPUTS ==> A=169 B=3 C=100 ] [runme_good] test == 170 [runme_bad ] test == 170
[ INPUTS ==> A=170 B=4 C=101 ] [runme_good] test == 101 [runme_bad ] test == 101
[ INPUTS ==> A=171 B=5 C=102 ] [runme_good] test == 102 [runme_bad ] test == 102
[ INPUTS ==> A=-75 B=-12 C=-103 ] [runme_good] test == 170 [runme_bad ] test == 170
You can also see the outputs of all the foo() calls if you use "./go.sh debug". I didn't show that here for the sake of clarity.
If you can help show where it breaks I will use that case to debug this further.
By the way, the above test result came from using an x86_64 with the Git monorepo of llvm-project, SHA c9e3dbb0a51c19b3a65a73962ebbf264c94d8902 from Fri Jun 14 2019.
Ah, thanks Craig! I wasn't aware that wraparound was included. That makes sense now and the values look correct for LVI.
I think constant ranges are inclusive of the start and exclusive of the end and wrap around. So 0,1 is [0,1) which is just 0. 1,0 is [1,0) which is everything but 0.
Craig, thanks for the correction. I was staring at it for hours yesterday and had tunnel vision on the ConstantRange value of B in loop, header, and exit.
I'm still a bit confused by this. For loop, header, and exit, all we know is that B cannot be zero, and in exit2 B must equal zero. These LatticeVals don't seem to represent that. I am a bit new to the nomenclature, so I may be misinterpreting their meaning.
However, the more I think about this, the more I suspect this is likely not the root cause. It is valid to state that the comparison of B to 0 only needs to be done once, and to bypass (thread) that check afterwards.
I'm going to analyze the bad IR more to determine exactly why it's invalid and what check was missed.
Isn't LVI showing the exit2 range for B to be <0,1>, while the other blocks say <1,0>?
For some reason LVI believes B to be constant on a few edges. I've tried flushing LVI with clear() to make sure it's not a bad update from JumpThreading and I see the same result. LVI believes B to be in the range <1, 0> in header, bad, loop, exit, and exit2. I'm still working with debug statements to determine why LVI believes this to be the case. Here's a dump from printLVI() showing the analysis inline in test().
define i32 @test(i32 %A, i32 %B, i32 %C) { entry: ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: overdefined ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' br label %header.1' in BB: '%entry' is: overdefined ; LatticeVal for: ' br label %header.1' in BB: '%header.1' is: overdefined br label %header.1
header.1: ; preds = %header, %entry ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: overdefined ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' %v.1.h = phi i32 [ %A, %entry ], [ %v.inc, %header ]' in BB: '%header.1' is: overdefined ; LatticeVal for: ' %v.1.h = phi i32 [ %A, %entry ], [ %v.inc, %header ]' in BB: '%bad' is: overdefined %v.1.h = phi i32 [ %A, %entry ], [ %v.inc, %header ] ; LatticeVal for: ' %w.1.h = phi i32 [ %C, %entry ], [ %v.inc, %header ]' in BB: '%header.1' is: overdefined ; LatticeVal for: ' %w.1.h = phi i32 [ %C, %entry ], [ %v.inc, %header ]' in BB: '%bad' is: overdefined %w.1.h = phi i32 [ %C, %entry ], [ %v.inc, %header ] ; LatticeVal for: ' br label %bad' in BB: '%header.1' is: overdefined ; LatticeVal for: ' br label %bad' in BB: '%bad' is: overdefined br label %bad
header: ; preds = %loop ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: constantrange<1, 0> ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' %cmp3 = icmp sgt i32 %v.inc, 100' in BB: '%header' is: overdefined %cmp3 = icmp sgt i32 %v.inc, 100 ; LatticeVal for: ' br i1 %cmp3, label %header.1, label %bad' in BB: '%header' is: overdefined br i1 %cmp3, label %header.1, label %bad
bad: ; preds = %header.1, %header ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: overdefined ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%bad' is: overdefined ; LatticeVal for: ' %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%exit2' is: overdefined ; LatticeVal for: ' %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%loop' is: overdefined %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ] ; LatticeVal for: ' %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%bad' is: overdefined ; LatticeVal for: ' %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%exit2' is: overdefined ; LatticeVal for: ' %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%loop' is: overdefined ; LatticeVal for: ' %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ]' in BB: '%exit' is: overdefined %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ] ; LatticeVal for: ' %v.inc = add i32 %v, 1' in BB: '%bad' is: constantrange<-1, -1> ; LatticeVal for: ' %v.inc = add i32 %v, 1' in BB: '%exit2' is: constantrange<-1, -1> ; LatticeVal for: ' %v.inc = add i32 %v, 1' in BB: '%loop' is: constantrange<-1, -1> ; LatticeVal for: ' %v.inc = add i32 %v, 1' in BB: '%header' is: constantrange<-2147483648, 171> %v.inc = add i32 %v, 1 ; LatticeVal for: ' %cmp2 = icmp eq i32 %B, 0' in BB: '%bad' is: overdefined ; LatticeVal for: ' %cmp2 = icmp eq i32 %B, 0' in BB: '%exit2' is: constantrange<-1, 0> ; LatticeVal for: ' %cmp2 = icmp eq i32 %B, 0' in BB: '%loop' is: constantrange<0, -1> %cmp2 = icmp eq i32 %B, 0 ; LatticeVal for: ' br i1 %cmp2, label %exit2, label %loop' in BB: '%bad' is: overdefined ; LatticeVal for: ' br i1 %cmp2, label %exit2, label %loop' in BB: '%exit2' is: overdefined ; LatticeVal for: ' br i1 %cmp2, label %exit2, label %loop' in BB: '%loop' is: overdefined br i1 %cmp2, label %exit2, label %loop
loop: ; preds = %bad ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: constantrange<1, 0> ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' call void @foo(i32 %v.inc)' in BB: '%loop' is: overdefined ; LatticeVal for: ' call void @foo(i32 %v.inc)' in BB: '%exit' is: overdefined ; LatticeVal for: ' call void @foo(i32 %v.inc)' in BB: '%header' is: overdefined call void @foo(i32 %v.inc) ; LatticeVal for: ' %cmp = icmp sgt i32 %v.inc, 170' in BB: '%loop' is: overdefined ; LatticeVal for: ' %cmp = icmp sgt i32 %v.inc, 170' in BB: '%exit' is: constantrange<-1, 0> ; LatticeVal for: ' %cmp = icmp sgt i32 %v.inc, 170' in BB: '%header' is: constantrange<0, -1> %cmp = icmp sgt i32 %v.inc, 170 ; LatticeVal for: ' br i1 %cmp, label %exit, label %header' in BB: '%loop' is: overdefined ; LatticeVal for: ' br i1 %cmp, label %exit, label %header' in BB: '%exit' is: overdefined ; LatticeVal for: ' br i1 %cmp, label %exit, label %header' in BB: '%header' is: overdefined br i1 %cmp, label %exit, label %header
exit: ; preds = %loop ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: constantrange<1, 0> ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' ret i32 %w' in BB: '%exit' is: overdefined ret i32 %w
exit2: ; preds = %bad ; LatticeVal for: 'i32 %A' is: overdefined ; LatticeVal for: 'i32 %B' is: constantrange<0, 1> ; LatticeVal for: 'i32 %C' is: overdefined ; LatticeVal for: ' ret i32 0' in BB: '%exit2' is: overdefined ret i32 0 }
There is only one threaded edge for the failing IR. During analysis in ProcessBlock() on the 'bad' BB we eventually call ComputeValueKnownInPredecessorsImpl(). Things look odd here.
; In function ComputeValueKnownInPredecessorsImpl() header: ; preds = %loop %cmp3 = icmp sgt i32 %v.inc, 100 br i1 %cmp3, label %header.1, label %bad
bad: ; preds = %header.1, %header %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ] %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ] %v.inc = add i32 %v, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit2, label %loop
; DTU->hasPendingDomTreeUpdates() == true ; Pred -> llvm::CmpInst::ICMP_EQ ; CmpLHS -> i32 %B ; CmpConst -> i32 0 ; P -> header: ; BB -> bad: ... ; CxtI -> br i1 %cmp2, label %exit2, label %loop Res = LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB, CxtI ? CxtI : Cmp); ; Res == llvm::LazyValueInfo::False -> INVALID! B is unaltered and passed in as a function parameter!
The thing I don't understand is why the CxtI instruction is not the same as the CmpLHS and CmpRHS values. I'm trying to figure out how LVI was called with these exact parameters.
This is another case where manually pushing IR through JumpThreading causes a failure while pushing it through clang++ -O3 ... does not.
I believe C code equivalent of the IR is the following:
extern void foo(int);
int test(int A, int B, int C) { if (B == 0) return 0;
A++; foo(A); if (A > 170) return C;
while (1) { int inc = A + 1; foo(inc); if (inc > 170) return A; A = inc; } }
Which is what I see when I push the original IR through clang++ -O3 ... (with a bit of cleanup on names for clarity):
declare void @foo(i32)
define i32 @test(i32 %A, i32 %B, i32 %C) { entry: %cmp_entry = icmp eq i32 %B, 0 br i1 %cmp_entry, label %exit, label %preheader
preheader: %inc_preheader = add i32 %A, 1 tail call void @foo(i32 %inc_preheader) %cmp_preheader = icmp sgt i32 %inc_preheader, 170 br i1 %cmp_preheader, label %exit, label %loop
loop: %inc_loop_phi = phi i32 [ %inc_loop, %loop ], [ %inc_preheader, %preheader ] %inc_loop = add i32 %inc_loop_phi, 1 tail call void @foo(i32 %inc_loop) %cmp39 = icmp sgt i32 %inc_loop, 170 br i1 %cmp39, label %exit, label %loop
exit: %merge = phi i32 [ 0, %entry ], [ %C, %preheader ], [ %inc_loop_phi, %loop ] ret i32 %merge }
If I dump the module at the very beginning of JumpThreading when called from clang and not opt, I see the following, subtly different, IR:
declare void @foo(i32) local_unnamed_addr
define i32 @test(i32 %A, i32 %B, i32 %C) local_unnamed_addr { entry: br label %header.1
header.1: %v.1.h = phi i32 [ %A, %entry ], [ %v.inc, %cont ] %w.1.h = phi i32 [ %C, %entry ], [ %v.inc, %cont ] br label %bad
bad: %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %cont ] %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %cont ] %v.inc = add i32 %v, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit, label %loop
loop: call void @foo(i32 %v.inc) %cmp = icmp sgt i32 %v.inc, 170 br i1 %cmp, label %exit, label %cont
cont: %cmp3 = icmp sgt i32 %v.inc, 100 br i1 %cmp3, label %header.1, label %bad
exit: %merge = phi i32 [ %w, %loop ], [ 0, %bad ] ret i32 %merge }
Extended Description
The following IR:
declare void @foo(i32)
define i32 @test(i32 %A, i32 %B, i32 %C) { entry: br label %header.1
header.1: %v.1.h = phi i32 [%A, %entry], [%v.h, %header] %w.1.h = phi i32 [%C, %entry], [%w.h, %header] br i1 false, label %header, label %bad
header: %v.h = phi i32 [%v.1.h, %header.1], [%v.inc, %cont] %w.h = phi i32 [%w.1.h, %header.1], [%v.inc, %cont] %cmp3 = icmp sgt i32 %w.h, 100 br i1 %cmp3, label %header.1, label %bad
bad: %v = phi i32 [%v.1.h, %header.1], [%v.h, %header] %w = phi i32 [%w.1.h, %header.1], [%w.h, %header] %v.inc = add i32 %v, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit2, label %loop
loop: call void @foo(i32 %v.inc) %cmp = icmp sgt i32 %v.inc, 170 br i1 %cmp, label %exit, label %cont
cont: br label %header
exit: ret i32 %w
exit2: ret i32 0
}
is incorrectly transformed
opt -passes=jump-threading ./test.ll -o test.opt.ll -S -debug-only=jump-threading Jump threading on function 'test' In block 'header.1' folding terminator: br i1 false, label %header, label %bad IN BB: bad: ; preds = %header.1, %header %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ] %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ] %v.inc = add i32 %v, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit2, label %loop BB 'bad': FOUND condition = i1 false for pred 'header'. Threading edge from 'header' to 'loop' with cost: 2, across block:
bad: ; preds = %header.1, %header %v = phi i32 [ %v.1.h, %header.1 ], [ %v.inc, %header ] %w = phi i32 [ %w.1.h, %header.1 ], [ %v.inc, %header ] %v.inc = add i32 %v, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit2, label %loop
JT: Renaming non-local uses of: %w = phi i32 [ %w.1.h, %header.1 ]
JT: Renaming non-local uses of: %v.inc = add i32 %v, 1
IN BB: loop: ; preds = %bad.thread, %bad %v.inc4 = phi i32 [ %v.inc1, %bad.thread ], [ %v.inc, %bad ] %w3 = phi i32 [ %v.inc1, %bad.thread ], [ %w.1.h, %bad ] call void @foo(i32 %v.inc4) %cmp = icmp sgt i32 %v.inc4, 170 br i1 %cmp, label %exit, label %header BB 'loop': FOUND condition = i1 false for pred 'bad.thread'. Not threading across block BB 'loop' to dest loop header BB 'header' - it might create an irreducible loop! IN BB: loop: ; preds = %bad.thread, %bad %v.inc4 = phi i32 [ %v.inc1, %bad.thread ], [ %v.inc, %bad ] %w3 = phi i32 [ %v.inc1, %bad.thread ], [ %w.1.h, %bad ] call void @foo(i32 %v.inc4) %cmp = icmp sgt i32 %v.inc4, 170 br i1 %cmp, label %exit, label %header BB 'loop': FOUND condition = i1 false for pred 'bad.thread'. Not threading across block BB 'loop' to dest loop header BB 'header' - it might create an irreducible loop!
to
; ModuleID = './test.ll' source_filename = "./test.ll"
declare void @foo(i32)
define i32 @test(i32 %A, i32 %B, i32 %C) { entry: br label %bad
header: ; preds = %loop %cmp3 = icmp sgt i32 %v.inc4, 100 br i1 %cmp3, label %bad, label %bad.thread
bad.thread: ; preds = %header %v.inc1 = add i32 %v.inc4, 1 br label %loop
bad: ; preds = %entry, %header %v.1.h = phi i32 [ %A, %entry ], [ %v.inc4, %header ] %w.1.h = phi i32 [ %C, %entry ], [ %v.inc4, %header ] %v.inc = add i32 %v.1.h, 1 %cmp2 = icmp eq i32 %B, 0 br i1 %cmp2, label %exit2, label %loop
loop: ; preds = %bad.thread, %bad %v.inc4 = phi i32 [ %v.inc1, %bad.thread ], [ %v.inc, %bad ] %w3 = phi i32 [ %v.inc1, %bad.thread ], [ %w.1.h, %bad ] call void @foo(i32 %v.inc4) %cmp = icmp sgt i32 %v.inc4, 170 br i1 %cmp, label %exit, label %header
exit: ; preds = %loop ret i32 %w3
exit2: ; preds = %bad ret i32 0 }
In the original program, in ret i32 %w, %w is always less than %v.inc (which is the argument of the call) by one.
In the resulting program, the return value is always equal to the argument of the call.