google / souper

A superoptimizer for LLVM IR
Apache License 2.0
2.13k stars 170 forks source link

missed optimization #157

Open regehr opened 9 years ago

regehr commented 9 years ago

This one assumes that we take Yang's fix to #154.

The program below has two addition functions that do overflow checking. They are exactly equivalent. Souper is able to prove that check_result() never throws an assertion but for some reason it does not prove that check_overflow() never throws an assertion. Can someone look into it?

Result is independent of solver choice and I'm compiling like this:

~/f/souper-yang/build/sclang -O3 add.c -Wall -S -emit-llvm

C code:

#include <limits.h>
#include <stdint.h>
#include <assert.h>
#include <stdio.h>

static const int BITS = CHAR_BIT * sizeof(int16_t);

static int checked_add_2(int16_t a, int16_t b, int16_t *rp) {
  int16_t r = (uint16_t)a + (uint16_t)b;
  *rp = r;
  return (a < 0 && b < 0 && r >= 0) || (a >= 0 && b >= 0 && r < 0);
}

static int checked_add_3(int16_t a, int16_t b, int16_t *rp) {
  uint16_t ur = (uint16_t)a + (uint16_t)b;
  uint16_t sr = ur >> (BITS - 1);
  uint16_t sa = (uint16_t)a >> (BITS - 1);
  uint16_t sb = (uint16_t)b >> (BITS - 1);
  *rp = ur;
  return (sa && sb && !sr) || (!sa && !sb && sr);
}

void check_result(long x, long y) {
  int16_t r1, r2;
  checked_add_2(x, y, &r1);
  checked_add_3(x, y, &r2);
  assert(r1 == r2);
}

void check_overflow(long x, long y) {
  int16_t r1, r2;
  int o1 = checked_add_2(x, y, &r1);
  int o2 = checked_add_3(x, y, &r2);
  assert(o1 == o2);
}

int main (void)
{
  long x, y, c;
  c = 0;
  for (x=INT16_MIN; x<=INT16_MAX; x++) {
    for (y=INT16_MIN; y<=INT16_MAX; y++) {
      check_result(x, y);
      check_overflow(x, y);
      c++;
    }
  }
  printf ("ran 0x%lx tests\n", c);
  return 0;
}
regehr commented 9 years ago

Here's the LLVM that I get -- I don't see any obvious Souper stoppers in there.

; Function Attrs: nounwind uwtable
define void @check_result(i64 %x, i64 %y) #0 {
entry:
  ret void
}

; Function Attrs: noreturn nounwind
declare void @__assert_fail(i8*, i8*, i32, i8*) #1

; Function Attrs: nounwind uwtable
define void @check_overflow(i64 %x, i64 %y) #0 {
entry:
  %conv = trunc i64 %x to i16
  %conv1 = trunc i64 %y to i16
  %add.i = add i16 %conv1, %conv
  %0 = and i64 %y, %x
  %1 = trunc i64 %0 to i16
  %2 = icmp slt i16 %1, 0
  %cmp10.i = icmp sgt i16 %add.i, -1
  %or.cond28.i = and i1 %2, %cmp10.i
  br i1 %or.cond28.i, label %checked_add_2.exit, label %lor.rhs.i

lor.rhs.i:                                        ; preds = %entry
  %3 = or i64 %y, %x
  %4 = trunc i64 %3 to i16
  %5 = icmp sgt i16 %4, -1
  %cmp20.i = icmp slt i16 %add.i, 0
  %cmp20..i = and i1 %5, %cmp20.i
  br label %checked_add_2.exit

checked_add_2.exit:                               ; preds = %entry, %lor.rhs.i
  %6 = phi i1 [ true, %entry ], [ %cmp20..i, %lor.rhs.i ]
  %shr28.i = lshr i16 %add.i, 15
  %shr629.i = lshr i16 %conv, 15
  %shr930.i = lshr i16 %conv1, 15
  %tobool.i = icmp eq i16 %shr629.i, 0
  br i1 %tobool.i, label %checked_add_3.exit, label %land.lhs.true.i

land.lhs.true.i:                                  ; preds = %checked_add_2.exit
  %tobool13.i = icmp eq i16 %shr930.i, 0
  %tobool15.i = icmp ne i16 %shr28.i, 0
  %or.cond.i = or i1 %tobool13.i, %tobool15.i
  br i1 %or.cond.i, label %checked_add_3.exit, label %land.lhs.true.i.checked_add_3.exit_crit_edge

land.lhs.true.i.checked_add_3.exit_crit_edge:     ; preds = %land.lhs.true.i
  br i1 %6, label %cond.end, label %cond.false

checked_add_3.exit:                               ; preds = %checked_add_2.exit, %land.lhs.true.i
  %7 = or i16 %shr930.i, %shr629.i
  %8 = icmp eq i16 %7, 0
  %tobool20.i = icmp ne i16 %shr28.i, 0
  %tobool20..i = and i1 %8, %tobool20.i
  %cmptmp = xor i1 %6, %tobool20..i
  br i1 %cmptmp, label %cond.false, label %cond.end

cond.false:                                       ; preds = %land.lhs.true.i.checked_add_3.exit_crit_edge, %checked_add_3.exit
  tail call void @__assert_fail(i8* getelementptr inbounds ([9 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0), i32 34, i8* getelementptr inbounds ([32 x i8]* @__PRETTY_FUNCTION__.check_overflow, i64 0, i64 0)) #3
  unreachable

cond.end:                                         ; preds = %land.lhs.true.i.checked_add_3.exit_crit_edge, %checked_add_3.exit
  ret void
}
regehr commented 9 years ago

Actually it is LLVM, not Souper, that proves check_result() is empty.

regehr commented 9 years ago

Here's the Souper in question:

%0 = block 3
%1:i64 = var                    ; x
%2:i16 = trunc %1               ; a
%3:i32 = sext %2                ; a promoted to int
%4:i1 = slt %3, 0:i32           ; a < 0
%5:i64 = var                    ; y
%6:i16 = trunc %5               ; b
%7:i32 = sext %6                ; b promoted to int
%8:i1 = slt %7, 0:i32           ; b < 0
%9:i1 = and %4, %8              ; a < 0 && b < 0
%10:i32 = zext %2               ; a promoted to int
%11:i32 = zext %6               ; b promoted to int
%12:i32 = addnsw %10, %11       ; a + b
%13:i16 = trunc %12             ; r
%14:i32 = sext %13              ; r promoted to int
%15:i1 = sle 0:i32, %14         ; 0 <= r
%16:i1 = and %9, %15            ; a < 0 && b < 0 && r >= 0
blockpc %0 1 %16 0:i1
blockpc %0 2 %16 0:i1
%17:i1 = sle 0:i32, %3          ; 0 <= a
%18:i1 = sle 0:i32, %7          ; 0 <= b
%19:i1 = and %17, %18           ; a >= 0 && b >= 0
blockpc %0 2 %19 1:i1
%20 = block 3
%21:i32 = ashr %10, 15:i32      ; a >> 15
%22:i16 = trunc %21             ; a >> 15
%23:i32 = zext %22              ; a >> 15 promoted to int
%24:i1 = ne 0:i32, %23          ; (a >> 15) != 0
%25:i32 = ashr %11, 15:i32      ; b >> 15
%26:i16 = trunc %25             ; b >> 15
%27:i32 = zext %26              ; b >> 15 promoted to int
%28:i1 = ne 0:i32, %27          ; (b >> 15) != 0
%29:i1 = and %24, %28           ; ((a >> 15) != 0 && (b >> 15) != 0
%30:i1 = xor 1:i1, %29          ; ((a >> 15) == 0 || (b >> 15) == 0
%31:i32 = zext %13              ; r promoted to int
%32:i32 = ashr %31, 15:i32      ; r >> 15
%33:i16 = trunc %32             ; r >> 15
%34:i1 = ne 0:i16, %33          ; (r >> 15) != 0
%35:i1 = or %30, %34            ; ((a >> 15) == 0 || (b >> 15) == 0 || (r >> 15) != 0
blockpc %20 1 %35 1:i1
blockpc %20 2 %35 1:i1
%36:i1 = ne 0:i16, %22          ; (a >> 15) != 0
%37:i1 = ne 0:i16, %26          ; (b >> 15) != 0
%38:i1 = or %36, %37            ; (a >> 15) != 0 || (b >> 15) != 0
blockpc %20 2 %38 0:i1
%39:i1 = slt %14, 0:i32         ; r < 0
%40:i1 = phi %0, 1:i1, 0:i1, %39
%41:i32 = zext %40
%42:i32 = zext %33
%43:i1 = ne 0:i32, %42
%44:i1 = phi %20, 1:i1, 0:i1, %43
%45:i32 = zext %44
%46:i1 = eq %41, %45
infer %46
regehr commented 9 years ago

And below is the LLVM. I believe this particular LLVM is correct since I tested it with all 2^32 inputs and the assertion never fails. Thus I think it's safe to say that Souper is either buggy or else somehow not up to the task of seeing the meaning of this code.

; ModuleID = 'add_souper_test_2.bc'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [9 x i8] c"o1 == o2\00", align 1
@.str1 = private unnamed_addr constant [18 x i8] c"add_souper_test.c\00", align 1
@__PRETTY_FUNCTION__.check_overflow = private unnamed_addr constant [32 x i8] c"void check_overflow(long, long)\00", align 1

; Function Attrs: nounwind uwtable
define void @check_overflow(i64 %x, i64 %y) #0 {
entry:
  %conv = trunc i64 %x to i16
  %conv1 = trunc i64 %y to i16
  %conv.i = zext i16 %conv to i32
  %conv1.i = zext i16 %conv1 to i32
  %add.i = add nsw i32 %conv.i, %conv1.i
  %conv2.i = trunc i32 %add.i to i16
  %conv3.i = sext i16 %conv to i32
  %cmp.i = icmp slt i32 %conv3.i, 0
  %conv5.i = sext i16 %conv1 to i32
  %cmp6.i = icmp slt i32 %conv5.i, 0
  %or.cond.i = and i1 %cmp.i, %cmp6.i
  %conv9.i = sext i16 %conv2.i to i32
  %cmp10.i = icmp sge i32 %conv9.i, 0
  %or.cond7.i = and i1 %or.cond.i, %cmp10.i
  br i1 %or.cond7.i, label %checked_add_2.exit, label %lor.rhs.i

lor.rhs.i:                                        ; preds = %entry
  %conv12.i = sext i16 %conv to i32
  %cmp13.i = icmp sge i32 %conv12.i, 0
  %conv16.i = sext i16 %conv1 to i32
  %cmp17.i = icmp sge i32 %conv16.i, 0
  %or.cond8.i = and i1 %cmp13.i, %cmp17.i
  br i1 %or.cond8.i, label %land.rhs.i, label %checked_add_2.exit

land.rhs.i:                                       ; preds = %lor.rhs.i
  %conv19.i = sext i16 %conv2.i to i32
  %cmp20.i = icmp slt i32 %conv19.i, 0
  br label %checked_add_2.exit

checked_add_2.exit:                               ; preds = %land.rhs.i, %lor.rhs.i, %entry
  %0 = phi i1 [ true, %entry ], [ false, %lor.rhs.i ], [ %cmp20.i, %land.rhs.i ]
  %lor.ext.i = zext i1 %0 to i32
  %conv2 = trunc i64 %x to i16
  %conv3 = trunc i64 %y to i16
  %conv.i3 = zext i16 %conv2 to i32
  %conv1.i4 = zext i16 %conv3 to i32
  %add.i5 = add nsw i32 %conv.i3, %conv1.i4
  %conv2.i6 = trunc i32 %add.i5 to i16
  %conv3.i7 = zext i16 %conv2.i6 to i32
  %shr.i = ashr i32 %conv3.i7, 15
  %conv4.i = trunc i32 %shr.i to i16
  %conv5.i8 = zext i16 %conv2 to i32
  %shr6.i = ashr i32 %conv5.i8, 15
  %conv7.i = trunc i32 %shr6.i to i16
  %conv8.i = zext i16 %conv3 to i32
  %shr9.i = ashr i32 %conv8.i, 15
  %conv10.i = trunc i32 %shr9.i to i16
  %conv11.i = zext i16 %conv7.i to i32
  %tobool.i = icmp ne i32 %conv11.i, 0
  %conv12.i9 = zext i16 %conv10.i to i32
  %tobool13.i = icmp ne i32 %conv12.i9, 0
  %or.cond.i10 = and i1 %tobool.i, %tobool13.i
  %or.cond.not.i = xor i1 %or.cond.i10, true
  %tobool15.i = icmp ne i16 %conv4.i, 0
  %or.cond7.i11 = or i1 %or.cond.not.i, %tobool15.i
  br i1 %or.cond7.i11, label %lor.rhs.i13, label %checked_add_3.exit

lor.rhs.i13:                                      ; preds = %checked_add_2.exit
  %tobool16.i = icmp ne i16 %conv7.i, 0
  %tobool18.i = icmp ne i16 %conv10.i, 0
  %or.cond8.i12 = or i1 %tobool16.i, %tobool18.i
  br i1 %or.cond8.i12, label %checked_add_3.exit, label %land.rhs.i15

land.rhs.i15:                                     ; preds = %lor.rhs.i13
  %conv19.i14 = zext i16 %conv4.i to i32
  %tobool20.i = icmp ne i32 %conv19.i14, 0
  br label %checked_add_3.exit

checked_add_3.exit:                               ; preds = %land.rhs.i15, %lor.rhs.i13, %checked_add_2.exit
  %1 = phi i1 [ true, %checked_add_2.exit ], [ false, %lor.rhs.i13 ], [ %tobool20.i, %land.rhs.i15 ]
  %lor.ext.i16 = zext i1 %1 to i32
  %cmp = icmp eq i32 %lor.ext.i, %lor.ext.i16
  br i1 %cmp, label %cond.end, label %cond.false

cond.false:                                       ; preds = %checked_add_3.exit
  call void @__assert_fail(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([18 x i8]* @.str1, i32 0, i32 0), i32 27, i8* getelementptr inbounds ([32 x i8]* @__PRETTY_FUNCTION__.check_overflow, i32 0, i32 0)) #2
  unreachable

cond.end:                                         ; preds = %checked_add_3.exit
  ret void
}

; Function Attrs: noreturn nounwind
declare void @__assert_fail(i8*, i8*, i32, i8*) #1

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { noreturn nounwind }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.6.0 (trunk 223586)"}
regehr commented 9 years ago

My guess is somehow we're asking too much of blockpc here but I haven't looked into it at that level.

rsas commented 9 years ago

I'll have a look at the blockpc code soon.

chenyang78 commented 9 years ago

John, Thanks for the detailed example! I will investigate it.

rsas commented 9 years ago

What I'm not sure about is if the bug is in the blockpc harvester or the solving part.

regehr commented 9 years ago

If I code the overflow checks in branch-free fashion, Souper optimizes everything properly:

static int checked_add_2(int16_t a, int16_t b, int16_t *rp) {
  int16_t r = (uint16_t)a + (uint16_t)b;
  *rp = r;
  // return (a < 0 && b < 0 && r >= 0) || (a >= 0 && b >= 0 && r < 0);
  return (a < 0 & b < 0 & r >= 0) | (a >= 0 & b >= 0 & r < 0);
}

static int checked_add_3(int16_t a, int16_t b, int16_t *rp) {
  const int BITS = CHAR_BIT * sizeof(int16_t);
  uint16_t ur = (uint16_t)a + (uint16_t)b;
  uint16_t sr = ur >> (BITS - 1);
  uint16_t sa = (uint16_t)a >> (BITS - 1);
  uint16_t sb = (uint16_t)b >> (BITS - 1);
  *rp = ur;
  // return (sa && sb && !sr) || (!sa && !sb && sr);
  return (sa & sb & !sr) | (!sa & !sb & sr);
}
regehr commented 9 years ago

Here's a simpler example where Souper misses the obvious optimization. Something definitely is wrong here.

#include <assert.h>

int sign(int x) {
  if (x < 0)
    return -1;
  if (x > 0)
    return 1;
  return 0;
}

int sign2(int x) {
  int y = x >> 31;
  unsigned z = 0 - (unsigned)x;
  unsigned w = z >> 31;
  return w | y;
}

void compare(int x) { assert(sign(x) == sign2(x)); }

#ifdef TEST

int main(void) {
  unsigned x = 0;
  do {
    compare(x);
    x++;
  } while (x != 0);
  return 0;
}

#endif
jeroenk commented 9 years ago

@regehr @chenyang78 This looks rather bad. sign is optimized to

define i32 @sign(i32 %x) #0 {
entry:
  %cmp = icmp slt i32 %x, 0
  br i1 %cmp, label %return, label %if.end

if.end:                                           ; preds = %entry
  %cmp1 = icmp sgt i32 %x, 0
  %. = zext i1 %cmp1 to i32
  br label %return

return:                                           ; preds = %if.end, %entry
  %retval.0 = phi i32 [ -1, %entry ], [ %., %if.end ]
  ret i32 %retval.0
}

Souper takes this and simplifies this to ret i32 0. This seems related to issue #166. The conditions generated from the if.end block are not sufficiently guarded to faithfully represent the execution which skips that block and immediately jumps to the return block.

chenyang78 commented 9 years ago

Thanks John and Jeroen, I will look at it more closely and try to fix #166 over the weekend.

chenyang78 commented 9 years ago

@jeroenk In my test, souper didn't optimize the code that you referred. Assume I saved the bitcode in the file simple2.ll:

    $ llvm-as -o simple2.bc simple2.ll
    $ souper -keep-solver-inputs -dump-klee-exprs -z3-path=/home/chenyang/other_programs/z3/build/z3 simple2.bc
    ; Listing valid replacements.
    ; Using solver: Z3 + internal cache
    Solver input saved to /tmp/input-eb9b23.smt2
    Solver input saved to /tmp/input-efb132.smt2
    Solver input saved to /tmp/input-7ac65c.smt2
    Solver input saved to /tmp/input-2a4427.smt2

Probably I am missing something?

jeroenk commented 9 years ago

@chenyang78 try passing -souper-infer-iN

jeroenk commented 9 years ago

@chenyang78 I'm a bit puzzled, I'm no longer able to reproduce the problem either. I'll keep digging a bit more.

jeroenk commented 9 years ago

@chenyang78 I'm starting to suspect that my binaries were out-of-sync compared to the state of my source tree. Hence, my apologies for the spurious report.

jeroenk commented 9 years ago

Back to John's last example. This gets optimized to:

entry:
  %cmp.i = icmp slt i32 %x, 0
  br i1 %cmp.i, label %sign.exit, label %if.end.i

if.end.i:                                         ; preds = %entry
  %cmp1.i = icmp sgt i32 %x, 0
  %..i = zext i1 %cmp1.i to i32
  br label %sign.exit

sign.exit:                                        ; preds = %if.end.i, %entry
  %retval.0.i = phi i32 [ -1, %entry ], [ %..i, %if.end.i ]
  %shr.i = ashr i32 %x, 31
  %sub.i = sub i32 0, %x
  %shr1.i = lshr i32 %sub.i, 31
  %or.i = or i32 %shr1.i, %shr.i
  %lnot = icmp eq i32 %retval.0.i, %or.i

...

The candidate souper generates for the eq instruction at the bottom is:

%0 = block 2
%1:i32 = var
%2:i1 = slt %1, 0:i32
blockpc %0 1 %2 0:i1
%3:i1 = slt 0:i32, %1
%4:i32 = zext %3
%5:i32 = phi %0, 4294967295:i32, %4
%6:i32 = sub 0:i32, %1
%7:i32 = lshr %6, 31:i32
%8:i32 = ashr %1, 31:i32
%9:i32 = or %7, %8
%10:i1 = eq %5, %9
infer %10

In case we follow the code path corresponding to the 4294967295:i32 (= -1) alternative in the phi node, the value of %1 is not getting constrained to a negative value. This means that the solver is allowed to pick a positive value, set %5 equal to -1 and then notice that %9 is either 0 or 1. Thus, no optimization is found. If I manually add blockpc %0 0 %2 1:i1 to the above candidate then the optimization is found. Hence, it seems that we're generating too few blockpc's.

chenyang78 commented 9 years ago

@jeroenk No problem at all. Thanks for digging into the issue!

The blockpc that you manually added was not generated by Souper's blockpc harvesting algorithm, because the basicblock sign.exit has multiple incoming branches. I am thinking that probably we shouldn't rule out the multiple-incoming-branches case. What do you think, Peter @pcc? Thanks.

regehr commented 9 years ago

@pcc might be busy doing other stuff?

Yang, are the modifications difficult? Clearly this needs to wait until #166 is sorted out. I keep running into additional test cases where this fix will matter.

chenyang78 commented 9 years ago

Yes, it needs some modification. I think we should wait until #166 gets fixed.

regehr commented 9 years ago

Sounds good-- we'll back burner this one until the no-UB version of Souper is totally solid.

My impression is that real support for for LLVM's (very nasty) UB model is going to take some time and thought.

chenyang78 commented 9 years ago

Yes. Thanks for running the test. Hopefully the fix to #166 would be good.

MaxGraey commented 5 years ago

Could souper simpilfy all int sign variants to:

int sign(int x) {
  return (x > 0) + (x >> 31);
}

Which usually optimal for all platforms?