Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

x86_64 JIT emits PC-relative relocation in non-PC-relative instruction #6876

Closed Quuxplusone closed 14 years ago

Quuxplusone commented 14 years ago
Bugzilla Link PR6429
Status RESOLVED FIXED
Importance P normal
Reported by Ralf Karrenberg (karrenberg@cs.uni-saarland.de)
Reported on 2010-02-26 03:42:41 -0800
Last modified on 2010-03-08 22:18:53 -0800
Version trunk
Hardware PC Linux
CC clattner@nondot.org, jfonseca@vmware.com, llvm-bugs@lists.llvm.org, llvm@sunfishcode.online
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
seems like I stumbled upona bug in x86_64 codegen in the latest svn trunk.
The problem did not appear in the last revision I used (r95086).

My C source looks like this:

#include <stdio.h>

int main(int argc, char** argv) {
    double a[2] = {1.14, 3.134};

    printf("unknown side-effect\n");
    for (unsigned i=0; i<2; ++i) {
        const double bad = a[i] / (a[i] == 0.0f ? 0.00001f : a[i]);
        printf("%f\n", bad);
    }
    return 0;
}

the expected output here is the following (e.g. gcc 4.4.1 or icc 11.1):
unknown side-effect
1.000000
1.000000

compile with llvm-gcc:
llvm-gcc -o test.bc -c -emit-llvm -Wall test.cpp

'lli test.bc' produces the following output:
unknown side-effect
0  lli             0x0000000000b8396f
1  lli             0x0000000000b8415d
2  libpthread.so.0 0x00007fdd88cc1190
3  libpthread.so.0 0x00007fdd8903b0c5
Stack dump:
0.      Program arguments: lli test.bc
Segmentation fault

gdb says it crashes at the double-comparison with 0, which seems obvious
looking at the address:
0x00007ffff7f4b094 <main+124>:  ucomisd 0xffffffffffffff73,%xmm0

part of the disassembly between the two printfs:
0x00007ffff7f4b05d <main+69>:   callq  *%rax
0x00007ffff7f4b05f <main+71>:   movl   $0x0,0xc(%rsp)
0x00007ffff7f4b067 <main+79>:   mov    $0x7ffff7fd9034,%rbx
0x00007ffff7f4b071 <main+89>:   mov    $0x7ffff6ef4b50,%r14
0x00007ffff7f4b07b <main+99>:   mov    $0x3ee4f8b580000000,%r15
0x00007ffff7f4b085 <main+109>:  jmpq   0x7ffff7f4b0dc <main+196>
0x00007ffff7f4b08a <main+114>:  mov    0xc(%rsp),%eax
0x00007ffff7f4b08e <main+118>:  movsd  0x10(%rsp,%rax,8),%xmm0
0x00007ffff7f4b094 <main+124>:  ucomisd 0xffffffffffffff73,%xmm0
0x00007ffff7f4b09d <main+133>:  setnp  %al
0x00007ffff7f4b0a0 <main+136>:  sete   %cl
0x00007ffff7f4b0a3 <main+139>:  test   %al,%cl
0x00007ffff7f4b0a5 <main+141>:  jne    0x7ffff7f4b0c0 <main+168>
0x00007ffff7f4b0ab <main+147>:  mov    0xc(%rsp),%eax
0x00007ffff7f4b0af <main+151>:  movsd  0x10(%rsp,%rax,8),%xmm1
0x00007ffff7f4b0b5 <main+157>:  movsd  %xmm1,0x20(%rsp)
0x00007ffff7f4b0bb <main+163>:  jmpq   0x7ffff7f4b0c5 <main+173>
0x00007ffff7f4b0c0 <main+168>:  mov    %r15,0x20(%rsp)
0x00007ffff7f4b0c5 <main+173>:  divsd  0x20(%rsp),%xmm0
0x00007ffff7f4b0cb <main+179>:  movsd  %xmm0,(%rsp)
0x00007ffff7f4b0d0 <main+184>:  mov    %rbx,%rdi
0x00007ffff7f4b0d3 <main+187>:  mov    $0x1,%al
0x00007ffff7f4b0d5 <main+189>:  callq  *%r14

the same happens when compiling with llvm-gcc and any optimization flag as well
as with clang -O0.
compiling with clang -O1 to -O3 works fine, though, so I guess it is something
in the generated bitcode, which looks like this for llvm-gcc -O3:

; ModuleID = 'test.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-
f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"

@.str = private constant [20 x i8] c"unknown side-effect\00", align 1 ; <[20 x
i8]*> [#uses=1]
@.str1 = private constant [4 x i8] c"%f\0A\00", align 1 ; <[4 x i8]*> [#uses=1]

define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
bb.nph:
  %a = alloca [2 x double], align 8               ; <[2 x double]*> [#uses=3]
  %0 = getelementptr inbounds [2 x double]* %a, i64 0, i64 0 ; <double*> [#uses=1]
  store double 1.140000e+00, double* %0, align 8
  %1 = getelementptr inbounds [2 x double]* %a, i64 0, i64 1 ; <double*> [#uses=1]
  store double 3.134000e+00, double* %1, align 8
  %2 = call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @.str, i64 0, i64 0)) ; <i32> [#uses=0]
  br label %bb

bb:                                               ; preds = %bb3.bb_crit_edge,
%bb.nph
  %3 = phi double [ 1.140000e+00, %bb.nph ], [ %.pre, %bb3.bb_crit_edge ] ; <double> [#uses=3]
  %indvar = phi i64 [ 1, %bb.nph ], [ %phitmp, %bb3.bb_crit_edge ] ; <i64> [#uses=3]
  %4 = fcmp une double %3, 0.000000e+00           ; <i1> [#uses=1]
  %iftmp.2.0 = select i1 %4, double %3, double 0x3EE4F8B580000000 ; <double> [#uses=1]
  %5 = fdiv double %3, %iftmp.2.0                 ; <double> [#uses=1]
  %6 = call i32 (i8*, ...)* @printf(i8* noalias getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), double %5) ; <i32> [#uses=0]
  %exitcond = icmp eq i64 %indvar, 2              ; <i1> [#uses=1]
  br i1 %exitcond, label %bb5, label %bb3.bb_crit_edge

bb3.bb_crit_edge:                                 ; preds = %bb
  %scevgep.phi.trans.insert = getelementptr [2 x double]* %a, i64 0, i64 %indvar ; <double*> [#uses=1]
  %.pre = load double* %scevgep.phi.trans.insert, align 8 ; <double> [#uses=1]
  %phitmp = add i64 %indvar, 1                    ; <i64> [#uses=1]
  br label %bb

bb5:                                              ; preds = %bb
  ret i32 0
}

declare i32 @puts(i8* nocapture) nounwind

declare i32 @printf(i8* noalias nocapture, ...) nounwind

this is what clang -O3 builds:

; ModuleID = 'test.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-
f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"

@_ZZ4mainE1a = internal constant [2 x double] [double 1.140000e+00, double
3.134000e+00], align 8 ; <[2 x double]*> [#uses=1]
@.str1 = private constant [4 x i8] c"%f\0A\00"    ; <[4 x i8]*> [#uses=1]
@str = internal constant [20 x i8] c"unknown side-effect\00" ; <[20 x i8]*>
[#uses=1]

define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
bb.nph:
  %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str, i64 0, i64 0)) ; <i32> [#uses=0]
  br label %for.body

for.body:                                         ; preds = %for.body, %bb.nph
  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] ; <i64> [#uses=2]
  %arrayidx15 = getelementptr [2 x double]* @_ZZ4mainE1a, i64 0, i64 %indvar ; <double*> [#uses=1]
  %tmp5 = load double* %arrayidx15                ; <double> [#uses=2]
  %div = fdiv double %tmp5, %tmp5                 ; <double> [#uses=1]
  %call18 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), double %div) ; <i32> [#uses=0]
  %indvar.next = add i64 %indvar, 1               ; <i64> [#uses=2]
  %exitcond = icmp eq i64 %indvar.next, 2         ; <i1> [#uses=1]
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 0
}

declare i32 @printf(i8* nocapture, ...) nounwind

declare i32 @puts(i8* nocapture) nounwind
Quuxplusone commented 14 years ago

Looks like a JIT relocation/encoding bug. Within LLI, the instruction looks like this:

UCOMISDrm %XMM0, %reg0, 1, %reg0, <cp#0>, %reg0, %EFLAGS

So it looks like the JIT is patching in a bogus address for the constant pool reference.

Quuxplusone commented 14 years ago

Specifically, it looks like it's using a PC-relative encoding, which is wrong here. This decision seems to happen in code substantially modified by r95693. Chris, can you investigate?

Quuxplusone commented 14 years ago

P5 priority and normal severity seems an understatement to me.

With the testing I've done here with llvmpipe (a 3D software renderer based on LLVM that's part of Mesa3D project) the trunk's x64_64 JIT is completely useless.

I can't test anything else here with the trunk.

Quuxplusone commented 14 years ago

priority and severity don't mean anything in this bugzilla.

Quuxplusone commented 14 years ago

I can't reproduce this failure (on darwin/x86-64), all of these modes work fine for me, including the .ll file pasted in the bug.

Quuxplusone commented 14 years ago
I'm seeing this GDB disassembly of the generated code:

0x0000000101c2c099: jmpq   0x101c2c0a7
0x0000000101c2c09e: movsd  0x8(%rsp,%rbx,8),%xmm0
0x0000000101c2c0a4: inc    %rbx
0x0000000101c2c0a7: ucomisd -0x8f(%rip),%xmm0        # 0x101c2c020
0x0000000101c2c0af: setp   %al
0x0000000101c2c0b2: setne  %cl
Quuxplusone commented 14 years ago
And this is the encoding I'm seeing:

(gdb) x/8bx 0x0000000101c2c0a7
0x101c2c0a7:    0x66    0x0f    0x2e    0x05    0x71    0xff    0xff    0xff

Please include the encoding the you're seeing the x86-64 jit produce.
Quuxplusone commented 14 years ago

Dan, if the machineinstr looks like "UCOMISDrm %XMM0, %reg0, 1, %reg0, <cp#0>, %reg0, %EFLAGS" then the bug is probably that the base register is not X86::RIP. This is not a code emitter bug, this is something wrong before code emitter.

Quuxplusone commented 14 years ago
Yes, this is not jit specific at all.  You can see this with the static codegen:

$ llc -mtriple=x86_64-linux-gnu t.bc -o -
...
        ucomisd .LCPI1_2, %xmm0
        setp    %al
        setne   %cl

$ llc  t.bc -o -
...
        ucomisd LCPI1_2(%rip), %xmm0
        setp    %al
        setne   %cl
Quuxplusone commented 14 years ago
It looks like the problem is this code in X86InstrInfo.cpp:

  switch (LoadMI->getOpcode()) {
  case X86::V_SET0:
  case X86::V_SETALLONES:
  case X86::FsFLD0SD:
  case X86::FsFLD0SS: {
    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
    // Create a constant-pool entry and operands to load from it.

    // x86-32 PIC requires a PIC base register for constant pools.
    unsigned PICBase = 0;
    if (TM.getRelocationModel() == Reloc::PIC_) {
      if (TM.getSubtarget<X86Subtarget>().is64Bit())
        PICBase = X86::RIP;
      else
        // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
        // This doesn't work for several reasons.
        // 1. GlobalBaseReg may have been spilled.
        // 2. It may not be live at MI.
        return NULL;
    }

The X86TargetLowering::LowerConstantPool function ignores the relocation model,
it uses:

if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    .. use rip ..

Dan, can you take a look at this?  This is all code that you and evan wrote.
Quuxplusone commented 14 years ago
(In reply to comment #9)
> Yes, this is not jit specific at all.  You can see this with the static
> codegen:
>
> $ llc -mtriple=x86_64-linux-gnu t.bc -o -
> ...
>         ucomisd .LCPI1_2, %xmm0
>         setp    %al
>         setne   %cl
>
> $ llc  t.bc -o -
> ...
>         ucomisd LCPI1_2(%rip), %xmm0
>         setp    %al
>         setne   %cl

This doesn't indicate a bug (encoding micro-optimization aside). llc is hard-
wired to emit only PIC on Darwin x86-64. On other targets it defaults to
static. In static mode, it's not necessary to use RIP-relative addressing.
Quuxplusone commented 14 years ago

That may be, but I have no plans to rewrite the JIT encoder this close to the 2.7 release. If you don't want to fix this for 2.7, I'm fine waiting until 2.8. 2.8 will hopefully have the entire JIT encoder replaced with the existing .o file writer stuff.

Quuxplusone commented 14 years ago
I was curious how the JIT copes with code like

    x ? 0.1 : 2.3

because this typically codegens as LCPI(,%rax,8), and it's not possible to
encode addresses using the scaled index with RIP.  The answer is that the JIT
defaults to the large code model, so the symbol gets emitted in a MOV64ri,
which bypasses this issue.

Given that, the bigger problem here is that that X86InstrInfo.cpp code wasn't
aware of the code model.  This is now fixed on trunk in r98042.

It's also the case that the binary code emitter will need more information (a
TargetFlag bit on the MachineOperand?) in order to be able to emit non-PC-
relative encodings, but that's a separate issue.
Quuxplusone commented 14 years ago

The new mc encoder should already have the targetflags. If it makes sense to pull into 2.7, please point tanya to the specific revs to pull in.