
Interprocedural scalar replacement of aggregates miscompile #33479

Open · llvmbot opened 7 years ago

llvmbot commented 7 years ago
Bugzilla Link: 34132
Version: unspecified
OS: Windows NT
Reporter: LLVM Bugzilla Contributor
CC: @chandlerc, @efriedma-quic, @gregbedwell, @zmodem, @RKSimon, @rgal, @rotateright, @wjristow, @yuanfang-chen

Extended Description

source.c:

#include <stdio.h>

struct {
  unsigned char a[4];
  unsigned char b[4];
} s;

static void init_arr(volatile void *data, unsigned char size) {
  unsigned char *bytes = (unsigned char *)data;
  for (unsigned char i = 0; i != size; ++i) {
    bytes[i] = i;
  }
}

int main() {
  init_arr(&s, sizeof(s));
  printf("a:%x %x %x %x\n", s.a[0], s.a[1], s.a[2], s.a[3]);
  printf("b:%x %x %x %x\n", s.b[0], s.b[1], s.b[2], s.b[3]);
}

clang -Oz -flto  source.c
or
clang -O0 -flto  source.c
or
clang -O1 -flto  source.c

<main>:
    xor    %eax,%eax
    lea    0x7fcb(%rip),%rcx        # 80a8 <__preinit_array_end>
    jmp    e5 <__start__Zdynsym+0x5>
    mov    %al,(%rax,%rcx,1)
    inc    %rax
    cmp    $0x8,%rax
    jne    df <__dynstr_end+0x6>
    push   %rbp
    mov    %rsp,%rbp
    movzbl 0x7fb2(%rip),%esi        # 80a8 <__preinit_array_end>
    movzbl 0x7fac(%rip),%edx        # 80a9 <__preinit_array_end+0x1>
    movzbl 0x7fa6(%rip),%ecx        # 80aa <__preinit_array_end+0x2>
    movzbl 0x7f9f(%rip),%r8d        # 80ab <__preinit_array_end+0x3>

    lea    0xdd(%rip),%rdi        # 1f0 <.L.str>
    xor    %eax,%eax
    callq  1a0 <printf@plt>
    lea    0xde(%rip),%rdi        # 1ff <.L.str.1>
    xor    %esi,%esi
    xor    %edx,%edx
    xor    %ecx,%ecx
    xor    %r8d,%r8d
    xor    %eax,%eax
    callq  1a0 <printf@plt>
    xor    %eax,%eax
    pop    %rbp
    retq   

Under LTO optimization, clang assumes that s.b[0] == s.b[1] == s.b[2] == s.b[3] == 0 and passes all-zero values to the second printf().

With optimization level -O2 or -O3 and -flto, clang produces correct code:

<main>:
    push   %rbp
    mov    %rsp,%rbp
    lea    0xf5(%rip),%rdi        # 1e0 <.L.str>
    mov    $0x0,%esi
    mov    $0x1,%edx
    mov    $0x2,%ecx
    mov    $0x3,%r8d
    xor    %eax,%eax
    callq  190 <printf@plt>
    lea    0xe1(%rip),%rdi        # 1ef <.L.str.1>
    mov    $0x4,%esi
    mov    $0x5,%edx
    mov    $0x6,%ecx
    mov    $0x7,%r8d
    xor    %eax,%eax
    callq  190 <printf@plt>
    xor    %eax,%eax
    pop    %rbp
    retq
llvmbot commented 7 years ago

I tried to look at this one, but I wasn't thrilled about any of the fixes I was able to come up with. A short-term solution would still be what I propose in comment#4, although I'm not entirely sure about the performance implications. This code was generated in my organization by a fuzz tester, so it's not something seen in the wild. I'd rather not rush a fix into 5.0.

zmodem commented 7 years ago

> This has been there probably since forever (2004), but it's a serious miscompile, so tentatively marking this as blocker.

It doesn't seem like this is getting fixed for 5.0 :-/ Unblocking.

I'm still open to taking a fix if one emerges real soon.

efriedma-quic commented 7 years ago

"inbounds" just means that the result has to be somewhere inside the allocated memory; you're still allowed to index outside the bounds of the GEP's type. (In theory, you could use inrange to drive SROA decisions, but we only support for that in limited circumstances, so not really useful at the moment.)

llvmbot commented 7 years ago

Andrea pointed out that there's actually something iffy in the input IR:

define internal fastcc void @init_arr() unnamed_addr #1 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %indvars.iv
  %0 = trunc i64 %indvars.iv to i8
  store i8 %0, i8* %arrayidx, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp = icmp eq i64 %indvars.iv.next, 8
  br i1 %cmp, label %for.cond.cleanup, label %for.body
}

The GEP results in out-of-bounds accesses. I guess this is technically UB? Still, since IPSRA bails out when there are out-of-bounds accesses to arrays, we might try to be nice and also detect some of these situations for structs, preventing the optimization in those cases.
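
Concretely (the annotations are mine, not from the original report), with %indvars.iv running from 0 to 7 the GEP walks straight through both members even though it only names member 0:

; i = 0..3 -> bytes of s.a, the [4 x i8] the GEP actually indexes
; i = 4..7 -> bytes of s.b, out of range of that member's type
%arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %indvars.iv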

llvmbot commented 7 years ago

CC'ing some people who understand SROA better than I do. I've been thinking about this a little more, and I think it's going to be hard (if not impossible) to make this transformation safe without rewriting the loop.

My proposal is to change the function that checks for safety [IsUserOfGlobalSafeForSRA()] to give up if the GEP indices are not all constants (currently it only checks the first two).

This would cost some optimization when doing partial SRA interprocedurally (see, e.g., test/Transforms/GlobalOpt/globalsra-partial.ll, and the sketch below), but I'm not sure it would matter much in practice.
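
For reference, the kind of partial SRA that would be lost looks roughly like this (illustrative IR in the spirit of that test, not copied from it):

@g = internal global { i32, [4 x float] } zeroinitializer

define void @set(i64 %i) {
  ; The trailing index is non-constant, but it can only move within
  ; member 1's [4 x float], so today's partial IP-SRA can still split
  ; @g into per-member globals. Requiring all-constant indices would
  ; give that up.
  %p = getelementptr inbounds { i32, [4 x float] }, { i32, [4 x float] }* @g, i64 0, i32 1, i64 %i
  store float 1.0, float* %p
  ret void
}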

Happy to hear about alternatives, if you have something in mind.

llvmbot commented 7 years ago

This has been there probably since forever (2004), but it's a serious miscompile, so tentatively marking this as blocker.

llvmbot commented 7 years ago

This is a nasty IP-SRA bug. The interprocedural SROA (inside GlobalOpt) replaces the global @s of type

%struct.anon = type { [4 x i8], [4 x i8] }

with:

@s.0 = internal global [4 x i8] zeroinitializer
@s.1 = internal global [4 x i8] zeroinitializer

And in @​init_arr() : define internal fastcc void @​init_arr() unnamed_addr { entry: br label %for.body

for.cond.cleanup: ; preds = %for.body ret void

for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %arrayidx.0 = getelementptr [4 x i8], [4 x i8] @​s.0, i32 0, i64 %indvars.iv %0 = trunc i64 %indvars.iv to i8 store i8 %0, i8 %arrayidx.0, align 1 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %cmp = icmp eq i64 %indvars.iv.next, 8 br i1 %cmp, label %for.cond.cleanup, label %for.body }

This is really wrong: SRA completely throws away @s.1, which is then considered "constant". I'll think about the best way to fix this. We might consider preventing IP-SRA in cases like this (we access some bytes of the struct via a GEP inside a loop, so when we replace the struct with two arrays the loop needs to be rewritten altogether, and from what I can see there's no such logic in SRA).
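
For illustration, here is a hedged sketch of what a correct split would have to produce: the single 0..7 loop rewritten into two 0..3 loops, one per replacement global. Nothing in GlobalOpt's SRA generates this today; it is what a fix along these lines would need to synthesize.

define internal fastcc void @init_arr() unnamed_addr {
entry:
  br label %loop.a

loop.a:                                           ; preds = %loop.a, %entry
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop.a ]
  %pa = getelementptr inbounds [4 x i8], [4 x i8]* @s.0, i64 0, i64 %i
  %va = trunc i64 %i to i8
  store i8 %va, i8* %pa, align 1
  %i.next = add nuw nsw i64 %i, 1
  %ca = icmp eq i64 %i.next, 4
  br i1 %ca, label %loop.b, label %loop.a

loop.b:                                           ; preds = %loop.b, %loop.a
  %j = phi i64 [ 0, %loop.a ], [ %j.next, %loop.b ]
  %pb = getelementptr inbounds [4 x i8], [4 x i8]* @s.1, i64 0, i64 %j
  %j.off = add nuw nsw i64 %j, 4
  %vb = trunc i64 %j.off to i8
  store i8 %vb, i8* %pb, align 1
  %j.next = add nuw nsw i64 %j, 1
  %cb = icmp eq i64 %j.next, 4
  br i1 %cb, label %exit, label %loop.b

exit:                                             ; preds = %loop.b
  ret void
}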

llvmbot commented 7 years ago

It's IPO/GlobalOpt.

$ ./opt meh.ll | ./lli
a:0 1 2 3
b:4 5 6 7

$ ./opt -globalopt meh.ll | ./lli
a:0 1 2 3
b:0 0 0 0

A little reduced:

%struct.anon = type { [4 x i8], [4 x i8] }

@s = internal global %struct.anon zeroinitializer, align 1
@.str = private unnamed_addr constant [15 x i8] c"a:%x %x %x %x\0A\00", align 1
@.str.1 = private unnamed_addr constant [15 x i8] c"b:%x %x %x %x\0A\00", align 1

; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
  tail call fastcc void @init_arr()
  %0 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 0), align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 1), align 1
  %conv1 = zext i8 %1 to i32
  %2 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 2), align 1
  %conv2 = zext i8 %2 to i32
  %3 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 3), align 1
  %conv3 = zext i8 %3 to i32
  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str, i64 0, i64 0), i32 %conv, i32 %conv1, i32 %conv2, i32 %conv3)
  %4 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 0), align 1
  %conv4 = zext i8 %4 to i32
  %5 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 1), align 1
  %conv5 = zext i8 %5 to i32
  %6 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 2), align 1
  %conv6 = zext i8 %6 to i32
  %7 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 3), align 1
  %conv7 = zext i8 %7 to i32
  %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.1, i64 0, i64 0), i32 %conv4, i32 %conv5, i32 %conv6, i32 %conv7)
  ret i32 0
}

; Function Attrs: norecurse nounwind uwtable
define internal fastcc void @init_arr() unnamed_addr #1 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %indvars.iv
  %0 = trunc i64 %indvars.iv to i8
  store i8 %0, i8* %arrayidx, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp = icmp eq i64 %indvars.iv.next, 8
  br i1 %cmp, label %for.cond.cleanup, label %for.body
}

; Function Attrs: nounwind
declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #2