f16 comparisons fail on Intel MacOS + LLVM17

laytan commented 8 months ago

Context

In my effort to update the CI to LLVM 17 (in #3124) I came across these test failures regarding f16 truncations.

See the runner logs here: https://github.com/odin-lang/Odin/actions/runs/7996079532/job/21837831886?pr=3124

laytan commented 7 months ago

Small reproduction (the assertion fails):

package main

main :: proc() {
    y := f16(11)
    x := f16(11)
    assert(y == x)
  // assert(bad(), "no")
}

// bad :: proc "contextless" () -> bool {
//  y := f16(11)
//  x := f16(11)
//  return x == y
// }

_main.bad:
    .cfi_startproc
    subq    $24, %rsp
    .cfi_def_cfa_offset 32
    pinsrw  $0, LCPI32_0(%rip), %xmm0
    pextrw  $0, %xmm0, 22(%rsp)
    pextrw  $0, %xmm0, 20(%rsp)
    pinsrw  $0, 20(%rsp), %xmm0
    movaps  %xmm0, %xmm1
    pextrw  $0, %xmm1, %eax
    movw    %ax, 14(%rsp)
    movzwl  22(%rsp), %edi
    callq   ___extendhfsf2
    movw    14(%rsp), %ax
    movss   %xmm0, 16(%rsp)
    movzwl  %ax, %edi
    callq   ___extendhfsf2
    movss   16(%rsp), %xmm1
    cmpeqss %xmm1, %xmm0
    movd    %xmm0, %eax
    andl    $1, %eax
    andb    $1, %al
    addq    $24, %rsp
    retq
    .cfi_endproc

define internal i8 @main.bad() {
decls:
  %y = alloca half, align 2
  %x = alloca half, align 2
  br label %entry

entry:                                            ; preds = %decls
  store half 0xH4980, ptr %y, align 2
  store half 0xH4980, ptr %x, align 2
  %0 = load half, ptr %x, align 2
  %1 = load half, ptr %y, align 2
  %2 = fcmp oeq half %0, %1
  %3 = zext i1 %2 to i8
  ret i8 %3
}

laytan commented 7 months ago

C equivalent (clang --target=x86_64-apple-macosx -O0 -S -emit-llvm test.c):

#include <assert.h>
#include <stdbool.h>

bool bad()
{
    __fp16 x = 11;
    __fp16 y = 11;
    return x == y;
}

int main(int argc, char *argv[])
{
    assert(bad());
    return 0;
}

_bad:                                   ## @bad
    .cfi_startproc
## %bb.0:
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register %rbp
    subq    $16, %rsp
                                        ## implicit-def: $xmm0
    pinsrw  $0, LCPI0_0(%rip), %xmm0
    pextrw  $0, %xmm0, -2(%rbp)
    pextrw  $0, %xmm0, -4(%rbp)
    movzwl  -2(%rbp), %edi
    callq   ___extendhfsf2
    movss   %xmm0, -8(%rbp)                 ## 4-byte Spill
    movzwl  -4(%rbp), %edi
    callq   ___extendhfsf2
    movaps  %xmm0, %xmm1
    movss   -8(%rbp), %xmm0                 ## 4-byte Reload
                                        ## xmm0 = mem[0],zero,zero,zero
    ucomiss %xmm1, %xmm0
    sete    %al
    setnp   %cl
    andb    %cl, %al
    andb    $1, %al
    movzbl  %al, %eax
    addq    $16, %rsp
    popq    %rbp
    retq
    .cfi_endproc

; Function Attrs: noinline nounwind optnone ssp uwtable
define zeroext i1 @bad() #0 {
  %1 = alloca half, align 2
  %2 = alloca half, align 2
  store half 0xH4980, ptr %1, align 2
  store half 0xH4980, ptr %2, align 2
  %3 = load half, ptr %1, align 2
  %4 = fpext half %3 to float
  %5 = load half, ptr %2, align 2
  %6 = fpext half %5 to float
  %7 = fcmp oeq float %4, %6
  ret i1 %7
}

laytan commented 7 months ago

I thought it might have been the fpext that clang was doing, but that seems to not fix it. I applied this patch:

diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp
index 12949f0ab..ae403b29d 100644
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -2694,7 +2694,15 @@ gb_internal lbValue lb_emit_comp(lbProcedure *p, TokenKind op_kind, lbValue left
        case Token_LtEq:  pred = LLVMRealOLE; break;
        case Token_NotEq: pred = LLVMRealONE; break;
        }
-       res.value = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+
+       if (are_types_identical(a, t_f16)) {
+           GB_ASSERT(are_types_identical(a, b));
+           LLVMValueRef lext = LLVMBuildFPExt(p->builder, left.value, LLVMFloatType(), "");
+           LLVMValueRef rext = LLVMBuildFPExt(p->builder, right.value, LLVMFloatType(), "");
+           res.value = LLVMBuildFCmp(p->builder, pred, lext, rext, "");
+       } else {
+           res.value = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+       }
    } else if (is_type_typeid(a)) {
        LLVMIntPredicate pred = {};
        switch (op_kind) {

Which produces this llvm:

define internal i8 @main.bad() {
decls:
  %y = alloca half, align 2
  %x = alloca half, align 2
  br label %entry

entry:                                            ; preds = %decls
  store half 0xH4980, ptr %y, align 2
  store half 0xH4980, ptr %x, align 2
  %0 = load half, ptr %x, align 2
  %1 = load half, ptr %y, align 2
  %2 = fpext half %0 to float
  %3 = fpext half %1 to float
  %4 = fcmp oeq float %2, %3
  %5 = zext i1 %4 to i8
  ret i8 %5
}

and this asm:

_main.bad:
    .cfi_startproc
    subq    $24, %rsp
    .cfi_def_cfa_offset 32
    pinsrw  $0, LCPI32_0(%rip), %xmm0
    pextrw  $0, %xmm0, 22(%rsp)
    pextrw  $0, %xmm0, 20(%rsp)
    movzwl  20(%rsp), %edi
    movzwl  22(%rsp), %eax
    movl    %eax, 12(%rsp)
    callq   ___extendhfsf2
    movl    12(%rsp), %edi
    movss   %xmm0, 16(%rsp)
    callq   ___extendhfsf2
    movaps  %xmm0, %xmm1
    movss   16(%rsp), %xmm0
    ucomiss %xmm1, %xmm0
    sete    %al
    setnp   %cl
    andb    %cl, %al
    andb    $1, %al
    addq    $24, %rsp
    retq
    .cfi_endproc

laytan commented 7 months ago

Setting __ODIN_LLVM_F16_SUPPORTED to false instead of true (what it currently gets set to) seems to "fix" it.

laytan commented 7 months ago

Looks like that changes the implementation of our builtin __extendhfsf2 to use the non-float registers for the calls and that was what llvm expects.

Expected is the half/f16 as an argument through edi and returning the float/f32 through xmm0

laytan commented 7 months ago

@gingerBill do you think we should set that to false for darwin_amd64? Or do you think there might be an underlying other issue here?

laytan commented 7 months ago

Interesting llvm issue regarding this: https://github.com/llvm/llvm-project/issues/56854

odin-lang / Odin

f16 comparisons fail on Intel MacOS + LLVM17 #3222

Context