Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

ThinLTO doesn't know about constant return values #32576

Open Quuxplusone opened 7 years ago

Quuxplusone commented 7 years ago
Bugzilla Link PR33604
Status NEW
Importance P enhancement
Reported by Davide Italiano (ditaliano@apple.com)
Reported on 2017-06-26 15:29:25 -0700
Last modified on 2017-10-18 04:09:35 -0700
Version trunk
Hardware PC All
CC charles.saternos@gmail.com, davidxl@google.com, joker.eph@gmail.com, llvm-bugs@lists.llvm.org, peter@pcc.me.uk, tejohnson@google.com, vivekvpandya@gmail.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
As Charles, our GSoC student working on ThinLTO, was looking for opportunities
to improve ThinLTO, here's something that came to mind.

I'm not sure whether we want to go in this direction, but it's a start.

$ cat a.c
__attribute__((noinline))
int patatino(void) {
  return 47;
}

$ cat b.c
extern int patatino(void);

int main(void) {
  return patatino();
}

$ ../clang a.c b.c -flto -fuse-ld=lld -Wl,-save-temps -o patatino-lto
$ objdump -d patatino-lto

[...]

0000000000201190 <main>:
  201190:       55                      push   %rbp
  201191:       48 89 e5                mov    %rsp,%rbp
  201194:       48 83 ec 10             sub    $0x10,%rsp
  201198:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
  20119f:       e8 dc ff ff ff          callq  201180 <patatino>
  2011a4:       b8 2f 00 00 00          mov    $0x2f,%eax
  2011a9:       48 83 c4 10             add    $0x10,%rsp
  2011ad:       5d                      pop    %rbp
  2011ae:       c3                      retq

With ThinLTO:

$ ../clang a.c b.c -flto=thin -fuse-ld=lld -Wl,-save-temps -o patatino-thin
$ objdump -d ./patatino-thin

[...]

0000000000201190 <main>:
  201190:       55                      push   %rbp
  201191:       48 89 e5                mov    %rsp,%rbp
  201194:       48 83 ec 10             sub    $0x10,%rsp
  201198:       c7 45 fc 00 00 00 00    movl   $0x0,-0x4(%rbp)
  20119f:       e8 dc ff ff ff          callq  201180 <patatino>
  2011a4:       48 83 c4 10             add    $0x10,%rsp
  2011a8:       5d                      pop    %rbp
  2011a9:       c3                      retq
Quuxplusone commented 7 years ago

This problem could be phrased more generally: the ThinLTO summary doesn't have enough information to perform interprocedural constant propagation (propagating constant values into both arguments and return values). Solving the problem for return values, however, should allow us to solve the problem for arguments with a similar solution.

Quuxplusone commented 7 years ago

What strikes me the most right now is that regular LTO still has a call. Why is that?

Quuxplusone commented 7 years ago
We should indeed. If the two functions are in the same TU we throw the function
away.
My wild guess is that we're somehow missing some attribute that allows us to
prove the call has no side effects and therefore can be removed.

https://godbolt.org/g/ctH7hT

I think this is an equally interesting but separate bug. I'll consider taking a
look in the next few days.
Quuxplusone commented 7 years ago
So, that actually depends on what you pass as optimization level for the per-TU
pipeline.

Nothing:

$ ../clang a.c b.c -flto -fuse-ld=lld -Wl,-save-temps -o patatino-lto
$ ../llvm-dis patatino-lto.0.5.precodegen.bc -o -
; ModuleID = 'patatino-lto.0.5.precodegen.bc'
source_filename = "ld-temp.o"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
define internal fastcc void @patatino() unnamed_addr #0 {
entry:
  ret void
}

; Function Attrs: noinline nounwind optnone uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
  %retval = alloca i32, align 4
  store i32 0, i32* %retval, align 4
  call fastcc void @patatino()
  ret i32 47
}

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

(this is because attribute #0 contains `optnone`).

with -O1:

$ ../clang -O1 a.c b.c -flto -fuse-ld=lld -Wl,-save-temps -o patatino-lto
$ ../llvm-dis patatino-lto.0.5.precodegen.bc -o -
; ModuleID = 'patatino-lto.0.5.precodegen.bc'
source_filename = "ld-temp.o"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: norecurse nounwind readnone uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
  ret i32 47
}

attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }