LLVM is unable to optimize a call to pow() with complex arguments

Quuxplusone commented 7 years ago


Bugzilla Link	PR31510
Status	NEW
Importance	P normal
Reported by	Davide Italiano (ditaliano@apple.com)
Reported on	2017-01-02 05:26:22 -0800
Last modified on	2018-02-11 02:39:09 -0800
Version	trunk
Hardware	PC All
CC	drraph@gmail.com, filcab@gmail.com, llvm-bugs@lists.llvm.org, simon.f.whittaker@gmail.com, spatel+llvm@rotateright.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also	PR31866

Testcase:

#include <complex>

std::complex<double> foo()
{
    using namespace std::complex_literals;
    std::complex<double> z2 = std::pow(1i, 2);
    return z2;
}

Compiled with -O3 -std=c++14.

Clang 3.9 output:

.LCPI0_0:
        .quad   4607182418800017408     # double 1
foo():                                # @foo()
        pushq   %rbx
        subq    $16, %rsp
        movl    $2, %ebx
        movsd   .LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
        xorpd   %xmm1, %xmm1
        movapd  %xmm0, %xmm5
        xorpd   %xmm4, %xmm4
        jmp     .LBB0_1
.LBB0_9:                                #   in Loop: Header=BB0_1 Depth=1
        movapd  %xmm7, %xmm0
        movapd  %xmm6, %xmm1
        movapd  %xmm4, %xmm2
        movapd  %xmm5, %xmm3
        movsd   %xmm4, 8(%rsp)          # 8-byte Spill
        movsd   %xmm5, (%rsp)           # 8-byte Spill
        callq   __muldc3
        movsd   (%rsp), %xmm5           # 8-byte Reload
        movsd   8(%rsp), %xmm4          # 8-byte Reload
.LBB0_1:                                # =>This Loop Header: Depth=1
        movapd  %xmm0, %xmm7
        movapd  %xmm1, %xmm6
        jmp     .LBB0_2
.LBB0_5:                                #   in Loop: Header=BB0_2 Depth=2
        movapd  %xmm0, %xmm2
        movapd  %xmm1, %xmm3
        movsd   %xmm6, 8(%rsp)          # 8-byte Spill
        movsd   %xmm7, (%rsp)           # 8-byte Spill
        callq   __muldc3
        movsd   (%rsp), %xmm7           # 8-byte Reload
        movsd   8(%rsp), %xmm6          # 8-byte Reload
        movapd  %xmm0, %xmm4
        movapd  %xmm1, %xmm5
        jmp     .LBB0_6
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
        movapd  %xmm4, %xmm0
        movapd  %xmm5, %xmm1
        shrl    %ebx
        je      .LBB0_10
        movapd  %xmm0, %xmm4
        mulsd   %xmm4, %xmm4
        movapd  %xmm1, %xmm2
        mulsd   %xmm2, %xmm2
        movapd  %xmm1, %xmm5
        mulsd   %xmm0, %xmm5
        subsd   %xmm2, %xmm4
        addsd   %xmm5, %xmm5
        ucomisd %xmm4, %xmm4
        jnp     .LBB0_6
        ucomisd %xmm5, %xmm5
        jp      .LBB0_5
.LBB0_6:                                #   in Loop: Header=BB0_2 Depth=2
        testb   $1, %bl
        je      .LBB0_2
        movapd  %xmm7, %xmm0
        mulsd   %xmm4, %xmm0
        movapd  %xmm6, %xmm2
        mulsd   %xmm5, %xmm2
        movapd  %xmm7, %xmm3
        mulsd   %xmm5, %xmm3
        movapd  %xmm6, %xmm1
        mulsd   %xmm4, %xmm1
        subsd   %xmm2, %xmm0
        addsd   %xmm3, %xmm1
        ucomisd %xmm0, %xmm0
        jnp     .LBB0_1
        ucomisd %xmm1, %xmm1
        jnp     .LBB0_1
        jmp     .LBB0_9
.LBB0_10:
        movapd  %xmm7, %xmm0
        movapd  %xmm6, %xmm1
        addq    $16, %rsp
        popq    %rbx
        retq

gcc7 output:

foo():
        subq    $8, %rsp
        pxor    %xmm2, %xmm2
        movsd   .LC0(%rip), %xmm3
        movapd  %xmm2, %xmm0
        movapd  %xmm3, %xmm1
        call    __muldc3
        pxor    %xmm3, %xmm3
        movsd   .LC0(%rip), %xmm2
        call    __muldc3
        addq    $8, %rsp
        ret
.LC0:
        .long   0
        .long   1072693248

Compiler explorer link if you want to play at home:
https://godbolt.org/g/GGfaei

I think this is not limited to pow(), but it's a place to start (if anybody
is interested).

Quuxplusone commented 7 years ago

Another example of not great output for complex:

#include <complex.h>
complex double f(complex double x) {
  return x*x;
}

LLVM 3.9 output:

f:                                      # @f
        movapd  xmm3, xmm1
        movapd  xmm2, xmm0
        mulsd   xmm0, xmm0
        movapd  xmm4, xmm3
        mulsd   xmm4, xmm4
        movapd  xmm1, xmm2
        mulsd   xmm1, xmm3
        subsd   xmm0, xmm4
        addsd   xmm1, xmm1
        ucomisd xmm0, xmm0
        jnp     .LBB0_3
        ucomisd xmm1, xmm1
        jp      .LBB0_2
.LBB0_3:
        ret
.LBB0_2:
        push    rax
        movapd  xmm0, xmm2
        movapd  xmm1, xmm3
        call    __muldc3
        add     rsp, 8
        ret

gcc output:

f:
        movapd  xmm3, xmm1
        movapd  xmm2, xmm0
        jmp     __muldc3

(It is equally not great if you pass -march=avx2, FWIW)

Quuxplusone commented 7 years ago

(The x*x example in my previous comment was compiled at -O3.)

Quuxplusone commented 7 years ago

The first example with -ffast-math shows a slightly more dramatic difference
(GCC folds the whole call to a constant, while clang still emits a loop):

clang 3.9 -O3 -ffast-math -std=c++14

.LCPI0_0:
        .quad   4607182418800017408     # double 1
foo():                                # @foo()
        mov     eax, 2
        movsd   xmm0, qword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero
        xorpd   xmm1, xmm1
        movapd  xmm3, xmm0
        xorpd   xmm2, xmm2
        jmp     .LBB0_1
.LBB0_4:                                #   in Loop: Header=BB0_1 Depth=1
        movapd  xmm4, xmm3
        mulsd   xmm4, xmm0
        movapd  xmm5, xmm2
        mulsd   xmm5, xmm1
        addsd   xmm5, xmm4
        mulsd   xmm0, xmm2
        mulsd   xmm1, xmm3
        subsd   xmm0, xmm1
        movapd  xmm1, xmm5
.LBB0_1:                                # =>This Loop Header: Depth=1
        movapd  xmm4, xmm3
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
        shr     eax
        je      .LBB0_5
        movapd  xmm3, xmm2
        addsd   xmm3, xmm3
        mulsd   xmm3, xmm4
        mulsd   xmm2, xmm2
        mulsd   xmm4, xmm4
        subsd   xmm2, xmm4
        test    al, 1
        movapd  xmm4, xmm3
        je      .LBB0_2
        jmp     .LBB0_4
.LBB0_5:
        ret

GCC:
foo():
        pxor    %xmm1, %xmm1
        movsd   .LC0(%rip), %xmm0
        ret
.LC0:
        .long   0
        .long   -1074790400

Quuxplusone / LLVMBugzillaTest

LLVM is unable to optimize a call to pow() with complex arguments #30483