Quuxplusone / LLVMBugzillaTest

Interprocedural scalar replacement of aggregates miscompile #33104

Open Quuxplusone opened 7 years ago

Quuxplusone commented 7 years ago
Bugzilla Link PR34132
Status NEW
Importance P enhancement
Reported by Konstantin Belochapka (konstantin.belochapka@sony.com)
Reported on 2017-08-08 19:39:24 -0700
Last modified on 2020-09-01 14:43:14 -0700
Version unspecified
Hardware PC Windows NT
CC andrea_dibiagio@sn.scee.net, chandlerc@gmail.com, ditaliano@apple.com, efriedma@quicinc.com, greg.bedwell@sony.com, hans@chromium.org, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, russell_gallop@sn.scee.net, spatel+llvm@rotateright.com, warren_ristow@playstation.sony.com, yuanfang.chen@sony.com
source.c:

#include <stdio.h>

struct {
  unsigned char a[4];
  unsigned char b[4];
} s;

static void init_arr(volatile void *data, unsigned char size) {
  unsigned char *bytes = (unsigned char *)data;
  for (unsigned char i = 0; i != size; ++i) {
    bytes[i] = i;
  }
}

int main() {
  init_arr(&s, sizeof(s));
  printf("a:%x %x %x %x\n", s.a[0], s.a[1], s.a[2], s.a[3]);
  printf("b:%x %x %x %x\n", s.b[0], s.b[1], s.b[2], s.b[3]);
}
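
For reference, the expected output (as confirmed by the unoptimized opt/lli run shown further below) is:

a:0 1 2 3
b:4 5 6 7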

clang -Oz -flto  source.c
or
clang -O0 -flto  source.c
or
clang -O1 -flto  source.c

<main>:
    xor    %eax,%eax
    lea    0x7fcb(%rip),%rcx        # 80a8 <__preinit_array_end>
    jmp    e5 <__start__Zdynsym+0x5>
    mov    %al,(%rax,%rcx,1)
    inc    %rax
    cmp    $0x8,%rax
    jne    df <__dynstr_end+0x6>
    push   %rbp
    mov    %rsp,%rbp
    movzbl 0x7fb2(%rip),%esi        # 80a8 <__preinit_array_end>
    movzbl 0x7fac(%rip),%edx        # 80a9 <__preinit_array_end+0x1>
    movzbl 0x7fa6(%rip),%ecx        # 80aa <__preinit_array_end+0x2>
    movzbl 0x7f9f(%rip),%r8d        # 80ab <__preinit_array_end+0x3>

    lea    0xdd(%rip),%rdi        # 1f0 <.L.str>
    xor    %eax,%eax
    callq  1a0 </PLprintf>
    lea    0xde(%rip),%rdi        # 1ff <.L.str.1>
    xor    %esi,%esi
    xor    %edx,%edx
    xor    %ecx,%ecx
    xor    %r8d,%r8d
    xor    %eax,%eax
    callq  1a0 </PLprintf>
    xor    %eax,%eax
    pop    %rbp
    retq

With LTO optimization, clang assumes that s.b[0] == s.b[1] == s.b[2] == s.b[3] == 0 and passes all-zero values to the second printf().

With optimization level -O2 or -O3 and -flto, clang produces correct code:

<main>:
    push   %rbp
    mov    %rsp,%rbp
    lea    0xf5(%rip),%rdi        # 1e0 <.L.str>
    mov    $0x0,%esi
    mov    $0x1,%edx
    mov    $0x2,%ecx
    mov    $0x3,%r8d
    xor    %eax,%eax
    callq  190 </PLprintf>
    lea    0xe1(%rip),%rdi        # 1ef <.L.str.1>
    mov    $0x4,%esi
    mov    $0x5,%edx
    mov    $0x6,%ecx
    mov    $0x7,%r8d
    xor    %eax,%eax
    callq  190 </PLprintf>
    xor    %eax,%eax
    pop    %rbp
    retq
Quuxplusone commented 7 years ago
It's IPO/GlobalOpt.

$ ./opt meh.ll | ./lli
a:0 1 2 3
b:4 5 6 7

$ ./opt -globalopt meh.ll | ./lli
a:0 1 2 3
b:0 0 0 0

A slightly reduced testcase:

%struct.anon = type { [4 x i8], [4 x i8] }

@s = internal global %struct.anon zeroinitializer, align 1
@.str = private unnamed_addr constant [15 x i8] c"a:%x %x %x %x\0A\00", align 1
@.str.1 = private unnamed_addr constant [15 x i8] c"b:%x %x %x %x\0A\00", align 1

; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
  tail call fastcc void @init_arr()
  %0 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 0), align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 1), align 1
  %conv1 = zext i8 %1 to i32
  %2 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 2), align 1
  %conv2 = zext i8 %2 to i32
  %3 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 0, i64 3), align 1
  %conv3 = zext i8 %3 to i32
  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str, i64 0, i64 0), i32 %conv, i32 %conv1, i32 %conv2, i32 %conv3)
  %4 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 0), align 1
  %conv4 = zext i8 %4 to i32
  %5 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 1), align 1
  %conv5 = zext i8 %5 to i32
  %6 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 2), align 1
  %conv6 = zext i8 %6 to i32
  %7 = load i8, i8* getelementptr inbounds (%struct.anon, %struct.anon* @s, i64 0, i32 1, i64 3), align 1
  %conv7 = zext i8 %7 to i32
  %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.1, i64 0, i64 0), i32 %conv4, i32 %conv5, i32 %conv6, i32 %conv7)
  ret i32 0
}

; Function Attrs: norecurse nounwind uwtable
define internal fastcc void @init_arr() unnamed_addr #1 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %indvars.iv
  %0 = trunc i64 %indvars.iv to i8
  store i8 %0, i8* %arrayidx, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp = icmp eq i64 %indvars.iv.next, 8
  br i1 %cmp, label %for.cond.cleanup, label %for.body
}

; Function Attrs: nounwind
declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #2
Quuxplusone commented 7 years ago

This is a nasty IP-SRA bug. The interprocedural SROA (inside GlobalOpt) replaces

%struct.anon = type { [4 x i8], [4 x i8] }

with:

@s.0 = internal global [4 x i8] zeroinitializer
@s.1 = internal global [4 x i8] zeroinitializer

And in @init_arr():

define internal fastcc void @init_arr() unnamed_addr {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx.0 = getelementptr [4 x i8], [4 x i8]* @s.0, i32 0, i64 %indvars.iv
  %0 = trunc i64 %indvars.iv to i8
  store i8 %0, i8* %arrayidx.0, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp = icmp eq i64 %indvars.iv.next, 8
  br i1 %cmp, label %for.cond.cleanup, label %for.body
}

This is really wrong: SRA completely throws away @s.1, which is then considered "constant". I'll think about the best way to fix this. We might consider preventing IP-SRA in cases like this: we're reaching some bytes of the struct via a GEP inside a loop, so replacing the struct with two arrays would require rewriting the loop altogether (and from what I can see there's no such logic in SRA).
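
To make the effect concrete, here is a minimal sketch (reconstructed from the assembly above, not GlobalOpt's literal output) of what @main effectively becomes once @s.1 is thrown away: with no stores left to @s.1, it is treated as its zeroinitializer and the four loads feeding the second printf fold to 0.

@.str.1 = private unnamed_addr constant [15 x i8] c"b:%x %x %x %x\0A\00", align 1

define i32 @main() {
entry:
  ; ... the first printf still observes the bytes stored through @s.0 ...
  ; No store to @s.1 survives the split, so it is considered a constant
  ; zeroinitializer and each s.b[i] load folds to 0:
  %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.1, i64 0, i64 0), i32 0, i32 0, i32 0, i32 0)
  ret i32 0
}

declare i32 @printf(i8* nocapture readonly, ...)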

Quuxplusone commented 7 years ago

This has probably been there since forever (2004), but it's a serious miscompile, so I'm tentatively marking this as a blocker.

Quuxplusone commented 7 years ago
CC:ing some people who understand SROA better than I do.
I've been thinking about this a little more, and I think it's going to be hard (if not impossible) to make this transformation safe without rewriting the loop.

My proposal is to change the function that checks for safety [IsUserOfGlobalSafeForSRA()] so that it gives up if the GEP indices are not all constants (currently it only checks the first two).

This would result in some optimization loss when trying to do partial SRA interprocedurally (see, e.g., test/Transforms/GlobalOpt/globalsra-partial.ll), but I'm not sure it will matter much in practice.
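
For illustration, here is a hypothetical snippet (assumed names, not taken from that test) showing the distinction the stricter check would draw:

%struct.anon = type { [4 x i8], [4 x i8] }
@s = internal global %struct.anon zeroinitializer, align 1

define i8 @example(i64 %i) {
entry:
  ; All indices constant: the access is trivially attributable to a single
  ; piece of the split global (s.b[2]), so SRA could still handle it.
  %p.const = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 1, i64 2
  ; Variable trailing index, as in @init_arr: the access may range over
  ; (and, as in this bug, past) the member named by the constant indices,
  ; so under the proposal its presence makes GlobalOpt give up on @s.
  %p.var = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %i
  %a = load i8, i8* %p.const, align 1
  %b = load i8, i8* %p.var, align 1
  %sum = add i8 %a, %b
  ret i8 %sum
}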

Happy to hear about alternatives, if you have something in mind.
Quuxplusone commented 7 years ago

Andrea pointed out that there's actually something iffy in the input IR:

define internal fastcc void @init_arr() unnamed_addr #1 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @s, i64 0, i32 0, i64 %indvars.iv
  %0 = trunc i64 %indvars.iv to i8
  store i8 %0, i8* %arrayidx, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp = icmp eq i64 %indvars.iv.next, 8
  br i1 %cmp, label %for.cond.cleanup, label %for.body
}

The GEP results in out-of-bounds accesses (it indexes past the member it names). I guess this is technically UB? Still, since IP-SRA already bails out when it sees out-of-bounds accesses into arrays, we might try to be nice and detect some of these situations for structs too, preventing the optimization in those cases.

Quuxplusone commented 7 years ago

"inbounds" just means that the result has to be somewhere inside the allocated memory; you're still allowed to index outside the bounds of the GEP's type. (In theory, you could use inrange to drive SROA decisions, but we only support for that in limited circumstances, so not really useful at the moment.)

Quuxplusone commented 7 years ago
(In reply to Davide Italiano from comment #3)
> This has probably been there since forever (2004), but it's a serious
> miscompile, so I'm tentatively marking this as a blocker.

It doesn't seem like this is getting fixed for 5.0 :-/ Unblocking.

I'm still open to taking a fix if one emerges real soon.
Quuxplusone commented 7 years ago
I tried to look at this one, but I wasn't thrilled with any of the fixes I was able to come up with. I guess a short-term solution would still be what I propose in comment #4, although I'm not exactly sure about the performance implications.

This code was generated in my organization by a fuzz tester, so it's not something seen in the wild. I'd rather not rush a fix into 5.0.