Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

Vector and scalar loop being dead code #25704

Open Quuxplusone opened 8 years ago

Quuxplusone commented 8 years ago
Bugzilla Link PR25705
Status CONFIRMED
Importance P normal
Reported by Chris Jenneisch (chrisj@codeaurora.org)
Reported on 2015-12-02 03:55:24 -0800
Last modified on 2016-01-18 16:50:45 -0800
Version unspecified
Hardware PC Windows NT
CC jdoerfert@anl.gov, johannes@jdoerfert.de, llvm-bugs@lists.llvm.org
Fixed by commit(s)
Attachments test.ll (3586 bytes, application/octet-stream)
Blocks
Blocked by
See also
Created attachment 15379
reduced test

opt  -polly-process-unprofitable -polly-code-generator=isl -polly-scops -polly-
codegen  test.ll -S

Both the scalar version and vector loop body polly.stmt.for.body end up being
dead code,

polly.preload.begin:
  br i1 true, label %polly.start, label %for.body

for.body:
  %i.034 = phi i32 [ %inc, %for.body ], [ 0, %polly.preload.begin ]
  %arrayidx = getelementptr inbounds i16, i16* %2, i32 %i.034
  %3 = load i16, i16* %arrayidx, align 2
  %add = add i16 %3, 10
  store i16 %add, i16* %arrayidx, align 2
  %inc = add nuw nsw i32 %i.034, 1
  %cmp = icmp slt i32 %inc, %1
  br i1 %cmp, label %for.body, label %for.cond2.preheader
..
..
..
polly.loop_if:
  br i1 false, label %polly.loop_preheader, label %polly.loop_exit

polly.loop_header:
  %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.for.body ]
  br label %polly.stmt.for.body

polly.stmt.for.body:
  %10 = trunc i64 %polly.indvar to i32
  %11 = shl i32 %10, 1
  %scevgep = getelementptr i8, i8* %call, i32 %11
  %scevgep1 = bitcast i8* %scevgep to i16*
  %_p_scalar_ = load i16, i16* %scevgep1, align 2, !alias.scope !0, !noalias !2
  %p_add = add i16 %_p_scalar_, 10
  store i16 %p_add, i16* %scevgep1, align 2, !alias.scope !0, !noalias !2
  %12 = trunc i64 %polly.indvar to i32
  %13 = add i32 %12, 1
  %p_cmp = icmp slt i32 %13, %1
  %polly.indvar_next = add nsw i64 %polly.indvar, 1
  %polly.loop_cond = icmp slt i64 %polly.indvar, -1
  br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

polly.loop_preheader:
  br label %polly.loop_header

When run with -simplifycfg

define void @foo1() #0 {
entry:
  %0 = load i32, i32* @n, align 4
  %call = tail call i8* @foo(i32 %0)
  store i8* %call, i8** bitcast (i16** @A to i8**), align 4
  %1 = load i32, i32* @n, align 4
  %cmp33 = icmp sgt i32 %1, 0
  %2 = bitcast i8* %call to i16*
  br i1 %cmp33, label %polly.stmt.for.body2, label %for.end20

for.end20:                                        ; preds =
%polly.stmt.for.body2, %entry
  ret void

polly.stmt.for.body2:                             ; preds = %entry
  %scevgep3 = getelementptr i8, i8* %call, i32 0
  %scevgep34 = bitcast i8* %scevgep3 to i16*
  %_p_scalar_5 = load i16, i16* %scevgep34, align 2, !alias.scope !0, !noalias !2
  %p_add6 = add i16 %_p_scalar_5, 10
  store i16 %p_add6, i16* %scevgep34, align 2, !alias.scope !0, !noalias !2
  %p_cmp7 = icmp slt i32 1, %1
  br label %for.end20
}
Quuxplusone commented 8 years ago

Attached test.ll (3586 bytes, application/octet-stream): reduced test

Quuxplusone commented 8 years ago
The schedule also might be incorrect,

if (p_0_loaded_from_n <= 64)

    {
      for (int c0 = 0; c0 < p_0_loaded_from_n; c0 += 1)
        Stmt_for_body(c0);
      if (p_0_loaded_from_n <= 0)   <====
        Stmt_for_body(0);           <====
    }

else
    {  /* original code */ }
Quuxplusone commented 8 years ago

I don't see any dead code here, sorry. For p_0_loaded_from_n > 0 The loop is executed p_0_loaded_from_n times. For p_0_loaded_from_n <= 0 the loop body is executed once as it was a do-while loop before. Note that this block was always executed before the loop backedge condition was evaluated.

Quuxplusone commented 8 years ago
I apologize, I misrepresented the problem. What I meant was the loop was being
removed, consider the output,

opt  -polly-process-unprofitable -polly-code-generator=isl -polly-scops -polly-
codegen test.ll -S

define void @foo1() #0 {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  %0 = load i32, i32* @n, align 4
  %call = tail call i8* @foo(i32 %0)
  store i8* %call, i8** bitcast (i16** @A to i8**), align 4
  %1 = load i32, i32* @n, align 4
  %cmp33 = icmp sgt i32 %1, 0
  %2 = bitcast i8* %call to i16*
  br i1 %cmp33, label %for.body.preheader, label %for.end20

for.body.preheader:                               ; preds = %entry.split
  br label %polly.split_new_and_old

for.cond2.preheader:                              ; preds = %for.body
  %cmp331 = icmp sgt i32 %1, 64
  br i1 %cmp331, label %for.body5.preheader, label %for.end20.region_exiting

for.body5.preheader:                              ; preds = %for.cond2.preheader
  br label %for.body5

polly.split_new_and_old:                          ; preds = %for.body.preheader
  br label %polly.preload.begin

polly.preload.begin:                              ; preds =
%polly.split_new_and_old
  br i1 true, label %polly.start, label %for.body

for.body:                                         ; preds =
%polly.preload.begin, %for.body
  %i.034 = phi i32 [ %inc, %for.body ], [ 0, %polly.preload.begin ]
  %arrayidx = getelementptr inbounds i16, i16* %2, i32 %i.034
  %3 = load i16, i16* %arrayidx, align 2
  %add = add i16 %3, 10
  store i16 %add, i16* %arrayidx, align 2
  %inc = add nuw nsw i32 %i.034, 1
  %cmp = icmp slt i32 %inc, %1
  br i1 %cmp, label %for.body, label %for.cond2.preheader

for.body5:                                        ; preds = %for.inc18,
%for.body5.preheader
  %i.132 = phi i32 [ %inc19, %for.inc18 ], [ 64, %for.body5.preheader ]
  %4 = load i32*, i32** @Y, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %4, i32 %i.132
  %5 = load i32, i32* %arrayidx6, align 4
  %cmp7 = icmp sgt i32 %5, -1
  br i1 %cmp7, label %land.lhs.true, label %for.inc18

land.lhs.true:                                    ; preds = %for.body5
  %6 = load i16*, i16** @A, align 4
  %arrayidx9 = getelementptr inbounds i16, i16* %6, i32 %i.132
  %7 = load i16, i16* %arrayidx9, align 2
  %cmp11 = icmp eq i16 %7, 1
  br i1 %cmp11, label %land.lhs.true13, label %for.inc18

land.lhs.true13:                                  ; preds = %land.lhs.true
  %call14 = tail call i32 @foo3(i32 %i.132)
  %cmp15 = icmp eq i32 %call14, 10
  br i1 %cmp15, label %if.then, label %for.inc18

if.then:                                          ; preds = %land.lhs.true13
  %8 = load i32*, i32** @C, align 4
  %arrayidx17 = getelementptr inbounds i32, i32* %8, i32 %i.132
  store i32 -2, i32* %arrayidx17, align 4
  br label %for.inc18

for.inc18:                                        ; preds = %if.then,
%land.lhs.true13, %land.lhs.true, %for.body5
  %inc19 = add nuw nsw i32 %i.132, 1
  %9 = load i32, i32* @n, align 4
  %cmp3 = icmp slt i32 %inc19, %9
  br i1 %cmp3, label %for.body5, label %for.end20.loopexit

for.end20.loopexit:                               ; preds = %for.inc18
  br label %for.end20.region_exiting

for.end20.region_exiting:                         ; preds =
%for.cond2.preheader, %for.end20.loopexit
  br label %polly.merge_new_and_old

polly.merge_new_and_old:                          ; preds = %polly.merge,
%for.end20.region_exiting
  br label %for.end20

for.end20:                                        ; preds =
%polly.merge_new_and_old, %entry.split
  ret void

polly.start:                                      ; preds = %polly.preload.begin
  br label %polly.loop_if

polly.loop_exit:                                  ; preds =
%polly.stmt.for.body, %polly.loop_if
  br label %polly.cond

polly.cond:                                       ; preds = %polly.loop_exit
  br i1 true, label %polly.then, label %polly.else

polly.merge:                                      ; preds = %polly.else,
%polly.stmt.for.body2
  br label %polly.merge_new_and_old

polly.loop_if:                                    ; preds = %polly.start
  br i1 false, label %polly.loop_preheader, label %polly.loop_exit

polly.loop_header:                                ; preds =
%polly.stmt.for.body, %polly.loop_preheader
  %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.for.body ]
  br label %polly.stmt.for.body

polly.stmt.for.body:                              ; preds = %polly.loop_header
  %10 = trunc i64 %polly.indvar to i32
  %11 = shl i32 %10, 1
  %scevgep = getelementptr i8, i8* %call, i32 %11
  %scevgep1 = bitcast i8* %scevgep to i16*
  %_p_scalar_ = load i16, i16* %scevgep1, align 2, !alias.scope !0, !noalias !2
  %p_add = add i16 %_p_scalar_, 10
  store i16 %p_add, i16* %scevgep1, align 2, !alias.scope !0, !noalias !2
  %12 = trunc i64 %polly.indvar to i32
  %13 = add i32 %12, 1
  %p_cmp = icmp slt i32 %13, %1
  %polly.indvar_next = add nsw i64 %polly.indvar, 1
  %polly.loop_cond = icmp slt i64 %polly.indvar, -1
  br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

polly.loop_preheader:                             ; preds = %polly.loop_if
  br label %polly.loop_header

polly.then:                                       ; preds = %polly.cond
  br label %polly.stmt.for.body2

polly.stmt.for.body2:                             ; preds = %polly.then
  %scevgep3 = getelementptr i8, i8* %call, i32 0
  %scevgep34 = bitcast i8* %scevgep3 to i16*
  %_p_scalar_5 = load i16, i16* %scevgep34, align 2, !alias.scope !0, !noalias !2
  %p_add6 = add i16 %_p_scalar_5, 10
  store i16 %p_add6, i16* %scevgep34, align 2, !alias.scope !0, !noalias !2
  %p_cmp7 = icmp slt i32 1, %1
  br label %polly.merge

polly.else:                                       ; preds = %polly.cond
  br label %polly.merge
}

Consider when n > 0

entry->entry.split->
for.body.preheader->
polly.split_new_and_old->
polly.preload.begin->  (true)
polly.start->
polly.loop_if-> (false)
polly.loop_exit->
polly.cond-> (true)
polly.then->
polly.stmt.for.body2->
polly.merge->
polly.merge_new_and_old->
for.end20->end

In polly.stmt.for.body2, it is executed just once.

When the n < 0

entry->entry.split->
for.end20->end

It looks like the upperbound is not being materialized in the
IslNodebBuilder::preloadInvariantLoop.
Quuxplusone commented 8 years ago
Unreached loops get removed by simplifycfg,(In reply to comment #0) and we get
the below output.

> When run with -simplifycfg
>
>
> define void @foo1() #0 {
> entry:
>   %0 = load i32, i32* @n, align 4
>   %call = tail call i8* @foo(i32 %0)
>   store i8* %call, i8** bitcast (i16** @A to i8**), align 4
>   %1 = load i32, i32* @n, align 4
>   %cmp33 = icmp sgt i32 %1, 0
>   %2 = bitcast i8* %call to i16*
>   br i1 %cmp33, label %polly.stmt.for.body2, label %for.end20
>
> for.end20:                                        ; preds =
> %polly.stmt.for.body2, %entry
>   ret void
>
> polly.stmt.for.body2:                             ; preds = %entry
>   %scevgep3 = getelementptr i8, i8* %call, i32 0
>   %scevgep34 = bitcast i8* %scevgep3 to i16*
>   %_p_scalar_5 = load i16, i16* %scevgep34, align 2, !alias.scope !0,
> !noalias !2
>   %p_add6 = add i16 %_p_scalar_5, 10
>   store i16 %p_add6, i16* %scevgep34, align 2, !alias.scope !0, !noalias !2
>   %p_cmp7 = icmp slt i32 1, %1
>   br label %for.end20
> }
Quuxplusone commented 8 years ago

"false" as condition for the branch in the polly.loop_if block is definitively hinky. I will take a look at this!

Quuxplusone commented 8 years ago

Move bugs to Polly product.