llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.77k stars 11.89k forks source link

Extra stack load/store generated for a volatile {i16, i16} store #42054

Open ramosian-glider opened 5 years ago

ramosian-glider commented 5 years ago
Bugzilla Link 42709
Version trunk
OS Linux
CC @DougGregor,@RKSimon,@zygoloid

Extended Description

For the following program:

$ cat tb.c
typedef struct { short v1, v2;} st_t;
void foo(st_t a) {
  volatile st_t b;
  b = a;
}

GCC generates a single store to a stack slot:

$ gcc tb.c -O2 -c
$ objdump -d tb.o
...
0000000000000000 <foo>:
   0: 89 7c 24 fc          mov    %edi,-0x4(%rsp)
   4: c3                    retq  

, whereas Clang uses an extra stack slot to store %edi for no reason:

$ clang tb.c -O2 -c
$ objdump -d tb.o
...
0000000000000000 <foo>:
   0: 89 7c 24 f8          mov    %edi,-0x8(%rsp)
   4: 8b 44 24 f8          mov    -0x8(%rsp),%eax
   8: 89 44 24 fc          mov    %eax,-0x4(%rsp)
   c: c3                    retq  

According to the generated IR, Clang chose to use a volatile load for that extra slot:

; Function Attrs: nounwind uwtable
define dso_local void @foo(i32 %a.coerce) local_unnamed_addr #0 {
entry:
  %a.sroa.0 = alloca i32, align 4
  %b.sroa.0 = alloca i32, align 4
  store i32 %a.coerce, i32* %a.sroa.0, align 4
  %b.sroa.0.0.b.0..sroa_cast = bitcast i32* %b.sroa.0 to i8* 
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %b.sroa.0.0.b.0..sroa_cast)
  %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0.copyload = load volatile i32, i32* %a.sroa.0, align 4
  store volatile i32 %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0.copyload, i32* %b.sroa.0, align 4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %b.sroa.0.0.b.0..sroa_cast)
  ret void
}
llvmbot commented 3 months ago

@llvm/issue-subscribers-backend-x86

Author: Alexander Potapenko (ramosian-glider)

| | |
| --- | --- |
| Bugzilla Link | [42709](https://llvm.org/bz42709) |
| Version | trunk |
| OS | Linux |
| CC | @DougGregor,@RKSimon,@zygoloid |

## Extended Description

For the following program:

```
$ cat tb.c
typedef struct { short v1, v2;} st_t;
void foo(st_t a) {
  volatile st_t b;
  b = a;
}
```

GCC generates a single store to a stack slot:

```
$ gcc tb.c -O2 -c
$ objdump -d tb.o
...
0000000000000000 <foo>:
   0: 89 7c 24 fc          mov    %edi,-0x4(%rsp)
   4: c3                    retq
```

, whereas Clang uses an extra stack slot to store %rdi for no reason:

```
$ clang tb.c -O2 -c
$ objdump -d tb.o
...
0000000000000000 <foo>:
   0: 89 7c 24 f8          mov    %edi,-0x8(%rsp)
   4: 8b 44 24 f8          mov    -0x8(%rsp),%eax
   8: 89 44 24 fc          mov    %eax,-0x4(%rsp)
   c: c3                    retq
```

According to the generated IR Clang chose to use a volatile load for that extra slot:

```
; Function Attrs: nounwind uwtable
define dso_local void @foo(i32 %a.coerce) local_unnamed_addr #0 {
entry:
  %a.sroa.0 = alloca i32, align 4
  %b.sroa.0 = alloca i32, align 4
  store i32 %a.coerce, i32* %a.sroa.0, align 4
  %b.sroa.0.0.b.0..sroa_cast = bitcast i32* %b.sroa.0 to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %b.sroa.0.0.b.0..sroa_cast)
  %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0.copyload = load volatile i32, i32* %a.sroa.0, align 4
  store volatile i32 %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0.copyload, i32* %b.sroa.0, align 4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %b.sroa.0.0.b.0..sroa_cast)
  ret void
}
```

- maybe that prevented DSE from removing the dead store.