macOS arm64 test failure: atomicrmw.t

shoe42 commented 2 years ago

The atomicrmw.t test fails on macOS arm64 (M1). macOS 12.5, using system clang and Homebrew LLVM 13. Trimmed output:

$ ../build/bin/terra atomicrmw.t
atomicrmw test settings: synscope true, align true, fadd true
atomicrmw.t:11:         terra atomic_add(x : &int32,y : int32,z : int32,w : int32,u : int32) : {}
atomicrmw.t:12:             atomicrmw("add", x, y, { syncscope = "", ordering = "seq_cst", align = native, isvolatile = false })
atomicrmw.t:13:             atomicrmw("add", x, z, { syncscope = "", ordering = "acq_rel", align = native, isvolatile = false })
atomicrmw.t:14:             fence({ syncscope = "", ordering = "release" })
atomicrmw.t:18:             atomicrmw("add", x, w, { syncscope = "singlethread", ordering = "monotonic", align = 16, isvolatile = false })
atomicrmw.t:30:             fence({ syncscope = "", ordering = "acquire" })
atomicrmw.t:31:             atomicrmw("add", x, u, { syncscope = "", ordering = "monotonic", align = native, isvolatile = true })
atomicrmw.t:11:         end
definition  {&int32,int32,int32,int32,int32} -> {}
define dso_local void @"$atomic_add"(i32* %0, i32 %1, i32 %2, i32 %3, i32 %4) {
entry:
  %5 = atomicrmw add i32* %0, i32 %1 seq_cst, align 4
  %6 = atomicrmw add i32* %0, i32 %2 acq_rel, align 4
  fence release
  %7 = atomicrmw add i32* %0, i32 %3 syncscope("singlethread") monotonic, align 16
  fence acquire
  %8 = atomicrmw volatile add i32* %0, i32 %4 monotonic, align 4
  ret void
}
assembly for function at address 0x109530000
0x109530000(+0):        ldaxr   w8, [x0]
0x109530004(+4):        add w8, w8, w1
0x109530008(+8):        stlxr   w9, w8, [x0]
0x10953000c(+12):       cbnz    w9, #-12
0x109530010(+16):       ldaxr   w8, [x0]
0x109530014(+20):       add w8, w8, w2
0x109530018(+24):       stlxr   w9, w8, [x0]
0x10953001c(+28):       cbnz    w9, #-12
0x109530020(+32):       dmb ish
0x109530024(+36):       ldxr    w8, [x0]
0x109530028(+40):       add w8, w8, w3
0x10953002c(+44):       stxr    w9, w8, [x0]
0x109530030(+48):       cbnz    w9, #-12
0x109530034(+52):       dmb ishld
0x109530038(+56):       ldxr    w8, [x0]
0x10953003c(+60):       add w8, w8, w4
0x109530040(+64):       stxr    w9, w8, [x0]
0x109530044(+68):       cbnz    w9, #-12
0x109530048(+72):       ret
54321
atomicrmw.t:69:         terra atomic_add_and_return(x : &int32,y : int32) : int32
atomicrmw.t:70:             return atomicrmw("add", x, y, { syncscope = "", ordering = "acq_rel", align = native, isvolatile = false })
atomicrmw.t:69:         end
definition  {&int32,int32} -> int32
define dso_local i32 @"$atomic_add_and_return"(i32* %0, i32 %1) {
entry:
  %2 = atomicrmw add i32* %0, i32 %1 acq_rel, align 4
  ret i32 %2
}
assembly for function at address 0x1096b0000
0x1096b0000(+0):        ldaxr   w8, [x0]
0x1096b0004(+4):        add w9, w8, w1
0x1096b0008(+8):        stlxr   w10, w9, [x0]
0x1096b000c(+12):       cbnz    w10, #-12
0x1096b0010(+16):       mov x0, x8
0x1096b0014(+20):       ret
22
atomicrmw.t:91:         terra atomic_fadd(x : &double,y : double) : {}
atomicrmw.t:92:             atomicrmw("fadd", x, y, { syncscope = "", ordering = "monotonic", align = native, isvolatile = false })
atomicrmw.t:91:         end
definition  {&double,double} -> {}
define dso_local void @"$atomic_fadd"(double* %0, double %1) {
entry:
  %2 = atomicrmw fadd double* %0, double %1 monotonic, align 8
  ret void
}
assembly for function at address 0x1096b8000
0x1096b8000(+0):        ldr d1, [x0]
0x1096b8004(+4):        b   #16
0x1096b8008(+8):        fmov    d1, x10
0x1096b800c(+12):       cmp x10, x9
0x1096b8010(+16):       b.eq    #40
0x1096b8014(+20):       fadd    d2, d1, d0
0x1096b8018(+24):       fmov    x8, d2
0x1096b801c(+28):       fmov    x9, d1
0x1096b8020(+32):       ldaxr   x10, [x0]
0x1096b8024(+36):       cmp x10, x9
0x1096b8028(+40):       b.ne    #-32
0x1096b802c(+44):       stlxr   wzr, x8, [x0]
0x1096b8030(+48):       cbnz    wzr, #-16
0x1096b8034(+52):       b   #-44
0x1096b8038(+56):       ret
21
atomicrmw.t:69:         terra atomic_add_and_return(x : &int32,y : int32) : int32
atomicrmw.t:70:             return atomicrmw("add", x, y, { syncscope = "", ordering = "acq_rel", align = native, isvolatile = false })
atomicrmw.t:69:         end
definition  {&int32,int32} -> int32
define dso_local i32 @"$atomic_add_and_return"(i32* %0, i32 %1) {
entry:
  %2 = atomicrmw add i32* %0, i32 %1 acq_rel, align 4
  ret i32 %2
}
assembly for function at address 0x1096b0000
0x1096b0000(+0):        ldaxr   w8, [x0]
0x1096b0004(+4):        add w9, w8, w1
0x1096b0008(+8):        stlxr   w10, w9, [x0]
0x1096b000c(+12):       cbnz    w10, #-12
0x1096b0010(+16):       mov x0, x8
0x1096b0014(+20):       ret
LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.stlxr
zsh: abort      ../build/bin/terra atomicrmw.t

elliottslaughter commented 2 years ago

Can you confirm that this reduced test reproduces the issue?

terra atomic_add_and_return(x : &int, y : int)
  return terralib.atomicrmw("add", x, y, {ordering = "acq_rel"})
end
atomic_add_and_return:printpretty(false)
atomic_add_and_return:disas()

terra add_and_return()
  var i : int = 1

  var r = atomic_add_and_return(&i, 20)

  return r + i
end

print(add_and_return())
assert(add_and_return() == 22)

shoe42 commented 2 years ago

Looks like that passes:

$ ../build/bin/terra bug603.t
bug603.t:1:             terra atomic_add_and_return(x : &int32,y : int32) : int32
bug603.t:2:                 return atomicrmw("add", x, y, { syncscope = "", ordering = "acq_rel", align = native, isvolatile = false })
bug603.t:1:             end
definition  {&int32,int32} -> int32
define dso_local i32 @"$atomic_add_and_return"(i32* %0, i32 %1) {
entry:
  %2 = atomicrmw add i32* %0, i32 %1 acq_rel, align 4
  ret i32 %2
}
assembly for function at address 0x1096e8000
0x1096e8000(+0):        ldaxr   w8, [x0]
0x1096e8004(+4):        add w9, w8, w1
0x1096e8008(+8):        stlxr   w10, w9, [x0]
0x1096e800c(+12):       cbnz    w10, #-12
0x1096e8010(+16):       mov x0, x8
0x1096e8014(+20):       ret
22

elliottslaughter commented 2 years ago

What about this?

terra atomic_fadd(x : &double, y : double)
  terralib.atomicrmw("fadd", x, y, {ordering = "monotonic"})
end
atomic_fadd:printpretty(false)
atomic_fadd:disas()

terra fadd()
  var f : double = 1.0

  atomic_fadd(&f, 20.0)

  return f
end

print(fadd())
assert(fadd() == 21.0)

shoe42 commented 2 years ago

Looks like it outputs 21 as expected:

$ ../build/bin/terra bug603_2.t
bug603_2.t:1:           terra atomic_fadd(x : &double,y : double) : {}
bug603_2.t:2:               atomicrmw("fadd", x, y, { syncscope = "", ordering = "monotonic", align = native, isvolatile = false })
bug603_2.t:1:           end
definition  {&double,double} -> {}
define dso_local void @"$atomic_fadd"(double* %0, double %1) {
entry:
  %2 = atomicrmw fadd double* %0, double %1 monotonic, align 8
  ret void
}
assembly for function at address 0x10bf7c000
0x10bf7c000(+0):        ldr d1, [x0]
0x10bf7c004(+4):        b   #16
0x10bf7c008(+8):        fmov    d1, x10
0x10bf7c00c(+12):       cmp x10, x9
0x10bf7c010(+16):       b.eq    #40
0x10bf7c014(+20):       fadd    d2, d1, d0
0x10bf7c018(+24):       fmov    x8, d2
0x10bf7c01c(+28):       fmov    x9, d1
0x10bf7c020(+32):       ldaxr   x10, [x0]
0x10bf7c024(+36):       cmp x10, x9
0x10bf7c028(+40):       b.ne    #-32
0x10bf7c02c(+44):       stlxr   wzr, x8, [x0]
0x10bf7c030(+48):       cbnz    wzr, #-16
0x10bf7c034(+52):       b   #-44
0x10bf7c038(+56):       ret
21

elliottslaughter commented 2 years ago

Ok, there's a typo in the test. That explains why I'm so confused.

Try this one:

terra atomic_xchg_pointer(x : &&int, y : &int)
  return terralib.atomicrmw("xchg", x, y, {ordering = "acq_rel"})
end
atomic_xchg_pointer:printpretty(false)
atomic_xchg_pointer:disas()

terra xchg_pointer()
  var i : int = 1
  var j : int = 20
  var k = &i

  var r = atomic_xchg_pointer(&k, &j)

  return r == &i and k == &j
end

print(xchg_pointer())
assert(xchg_pointer())

shoe42 commented 2 years ago

bug603.t:1:             terra atomic_xchg_pointer(x : &&int32,y : &int32) : &int32
bug603.t:2:                 return atomicrmw("xchg", x, y, { syncscope = "", ordering = "acq_rel", align = native, isvolatile = false })
bug603.t:1:             end
definition  {&&int32,&int32} -> &int32
LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.stlxr
zsh: abort      ../build/bin/terra bug603.t

elliottslaughter commented 2 years ago

Let's try running this code through Clang. This is as close as I can get to the equivalent C code, and if it works maybe we can teach Terra to emit the equivalent LLVM IR.

bug603.c:

#include <assert.h>

int *atomic_xchg_pointer(int **x, int *y) {
  int *result;
  __atomic_exchange(x, &y, &result, __ATOMIC_ACQ_REL);
  return result;
}

int xchg_pointer() {
  int i = 1;
  int j = 20;
  int *k = &i;

  int *r = atomic_xchg_pointer(&k, &j);

  return r == &i && k == &j;
}

int main() {
  assert(xchg_pointer());
  return 0;
}

clang bug603.c -o bug603 -Wall
./bug603
clang bug603.c -o bug603.ll -S -emit-llvm -Wall

And attach the final .ll file here so we can see what Clang is generating (assuming it compiles at all).

elliottslaughter commented 2 years ago

Also please run this through Terra and let's see what Terra is generating:

bug603.t:

terra atomic_xchg_pointer(x : &&int, y : &int)
  return terralib.atomicrmw("xchg", x, y, {ordering = "acq_rel"})
end

terra xchg_pointer()
  var i : int = 1
  var j : int = 20
  var k = &i

  var r = atomic_xchg_pointer(&k, &j)

  return r == &i and k == &j
end
print(terralib.saveobj(nil, "llvmir", {xchg_pointer=xchg_pointer}, nil, nil, false))

print(xchg_pointer())
assert(xchg_pointer())

shoe42 commented 2 years ago

Here's the IR from bug603.c:

; ModuleID = 'bug603.c'
source_filename = "bug603.c"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx12.0.0"

@__func__.main = private unnamed_addr constant [5 x i8] c"main\00", align 1
@.str = private unnamed_addr constant [9 x i8] c"bug603.c\00", align 1
@.str.1 = private unnamed_addr constant [15 x i8] c"xchg_pointer()\00", align 1

; Function Attrs: noinline nounwind optnone ssp uwtable
define i32* @atomic_xchg_pointer(i32** %0, i32* %1) #0 {
  %3 = alloca i32**, align 8
  %4 = alloca i32*, align 8
  %5 = alloca i32*, align 8
  store i32** %0, i32*** %3, align 8
  store i32* %1, i32** %4, align 8
  %6 = load i32**, i32*** %3, align 8
  %7 = bitcast i32** %6 to i64*
  %8 = bitcast i32** %4 to i64*
  %9 = bitcast i32** %5 to i64*
  %10 = load i64, i64* %8, align 8
  %11 = atomicrmw xchg i64* %7, i64 %10 acq_rel, align 8
  store i64 %11, i64* %9, align 8
  %12 = load i32*, i32** %5, align 8
  ret i32* %12
}

; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @xchg_pointer() #0 {
  %1 = alloca i32, align 4
  %2 = alloca i32, align 4
  %3 = alloca i32*, align 8
  %4 = alloca i32*, align 8
  store i32 1, i32* %1, align 4
  store i32 20, i32* %2, align 4
  store i32* %1, i32** %3, align 8
  %5 = call i32* @atomic_xchg_pointer(i32** %3, i32* %2)
  store i32* %5, i32** %4, align 8
  %6 = load i32*, i32** %4, align 8
  %7 = icmp eq i32* %6, %1
  br i1 %7, label %8, label %11

8:                                                ; preds = %0
  %9 = load i32*, i32** %3, align 8
  %10 = icmp eq i32* %9, %2
  br label %11

11:                                               ; preds = %8, %0
  %12 = phi i1 [ false, %0 ], [ %10, %8 ]
  %13 = zext i1 %12 to i32
  ret i32 %13
}

; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @main() #0 {
  %1 = alloca i32, align 4
  store i32 0, i32* %1, align 4
  %2 = call i32 @xchg_pointer()
  %3 = icmp ne i32 %2, 0
  %4 = xor i1 %3, true
  %5 = zext i1 %4 to i32
  %6 = sext i32 %5 to i64
  %7 = icmp ne i64 %6, 0
  br i1 %7, label %8, label %10

8:                                                ; preds = %0
  call void @__assert_rtn(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @__func__.main, i64 0, i64 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i64 0, i64 0), i32 20, i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.1, i64 0, i64 0)) #2
  unreachable

9:                                                ; No predecessors!
  br label %11

10:                                               ; preds = %0
  br label %11

11:                                               ; preds = %10, %9
  ret i32 0
}

; Function Attrs: cold noreturn
declare void @__assert_rtn(i8*, i8*, i32, i8*) #1

attributes #0 = { noinline nounwind optnone ssp uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="__chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.5a,+zcm,+zcz" }
attributes #1 = { cold noreturn "disable-tail-calls"="true" "frame-pointer"="non-leaf" "no-trapping-math"="true" "probe-stack"="__chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+v8.5a,+zcm,+zcz" }
attributes #2 = { cold noreturn }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
!llvm.ident = !{!9}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 1, !"branch-target-enforcement", i32 0}
!3 = !{i32 1, !"sign-return-address", i32 0}
!4 = !{i32 1, !"sign-return-address-all", i32 0}
!5 = !{i32 1, !"sign-return-address-with-bkey", i32 0}
!6 = !{i32 7, !"PIC Level", i32 2}
!7 = !{i32 7, !"uwtable", i32 1}
!8 = !{i32 7, !"frame-pointer", i32 1}
!9 = !{!"Apple clang version 13.1.6 (clang-1316.0.21.2.5)"}

And from bug603.t:

; ModuleID = 'terra'
source_filename = "terra"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin21.6.0"

define dso_local i1 @xchg_pointer() {
entry:
  %r = alloca i32*, align 8
  %k = alloca i32*, align 8
  %j = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 1, i32* %i, align 4
  store i32 20, i32* %j, align 4
  store i32* %i, i32** %k, align 8
  %0 = call i32* @"$atomic_xchg_pointer"(i32** %k, i32* %j)
  store i32* %0, i32** %r, align 8
  %1 = load i32*, i32** %r, align 8
  %2 = icmp eq i32* %1, %i
  %3 = zext i1 %2 to i8
  %4 = trunc i8 %3 to i1
  br i1 %4, label %and.rhs, label %and.end

and.rhs:                                          ; preds = %entry
  %5 = load i32*, i32** %k, align 8
  %6 = icmp eq i32* %5, %j
  %7 = zext i1 %6 to i8
  %8 = trunc i8 %7 to i1
  br label %and.end

and.end:                                          ; preds = %and.rhs, %entry
  %9 = phi i1 [ false, %entry ], [ %8, %and.rhs ]
  %10 = zext i1 %9 to i8
  %11 = icmp ne i8 %10, 0
  ret i1 %11

dead:                                             ; No predecessors!
  ret i1 undef
}

define internal i32* @"$atomic_xchg_pointer"(i32** %0, i32* %1) {
entry:
  %y = alloca i32*, align 8
  %x = alloca i32**, align 8
  store i32** %0, i32*** %x, align 8
  store i32* %1, i32** %y, align 8
  %2 = load i32**, i32*** %x, align 8
  %3 = load i32*, i32** %y, align 8
  %4 = atomicrmw xchg i32** %2, i32* %3 acq_rel, align 8
  ret i32* %4

dead:                                             ; No predecessors!
  ret i32* undef
}

Still with the following error:

LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.stlxr
zsh: abort      ../build/bin/terra bug603.t

elliottslaughter commented 2 years ago

Ok, so it looks like the C compiler treats a pointer as a int64 and then does an atomic on that.

I'm on the fence about whether Terra should make any effort to deal with this situation or not. int64 atomics are already available to the user. The atomicrmw is a pretty low level interface to LLVM and so I'm inclined to say you should get what you ask for, even if LLVM is broken in some combinations.

elliottslaughter commented 2 years ago

I was playing with this again, and I noticed that running the LLVM IR through llc results in this error:

llc: error: .../llc: bug603t_x86.ll:48:33: error: atomicrmw xchg operand must be an integer or floating point type
  %4 = atomicrmw xchg i32** %2, i32* %3 acq_rel, align 8
                                ^

This seems inconsistent with the documentation, but I guess the documentation is just wrong. Oh well.

I'll put in a change to disallow pointers in xchg since it is apparently not supported by LLVM.

elliottslaughter commented 2 years ago

@shoe42 Can you try again on master? I think all of these bugs should be fixed or worked around at this point.

shoe42 commented 2 years ago

Seems to be fixed - only cconv_array.t fails now as of 71f2180cbf252f615c1fb7b2d7dd0746581b0741.

terralang / terra

macOS arm64 test failure: atomicrmw.t #603