Closed laytan closed 7 months ago
Small reproduction (the assertion fails):
package main
main :: proc() {
y := f16(11)
x := f16(11)
assert(y == x)
// assert(bad(), "no")
}
// bad :: proc "contextless" () -> bool {
// y := f16(11)
// x := f16(11)
// return x == y
// }
_main.bad:
.cfi_startproc
subq $24, %rsp
.cfi_def_cfa_offset 32
pinsrw $0, LCPI32_0(%rip), %xmm0
pextrw $0, %xmm0, 22(%rsp)
pextrw $0, %xmm0, 20(%rsp)
pinsrw $0, 20(%rsp), %xmm0
movaps %xmm0, %xmm1
pextrw $0, %xmm1, %eax
movw %ax, 14(%rsp)
movzwl 22(%rsp), %edi
callq ___extendhfsf2
movw 14(%rsp), %ax
movss %xmm0, 16(%rsp)
movzwl %ax, %edi
callq ___extendhfsf2
movss 16(%rsp), %xmm1
cmpeqss %xmm1, %xmm0
movd %xmm0, %eax
andl $1, %eax
andb $1, %al
addq $24, %rsp
retq
.cfi_endproc
define internal i8 @main.bad() {
decls:
%y = alloca half, align 2
%x = alloca half, align 2
br label %entry
entry: ; preds = %decls
store half 0xH4980, ptr %y, align 2
store half 0xH4980, ptr %x, align 2
%0 = load half, ptr %x, align 2
%1 = load half, ptr %y, align 2
%2 = fcmp oeq half %0, %1
%3 = zext i1 %2 to i8
ret i8 %3
}
C equivalent (clang --target=x86_64-apple-macosx -O0 -S -emit-llvm test.c
):
#include <assert.h>
#include <stdbool.h>
bool bad()
{
__fp16 x = 11;
__fp16 y = 11;
return x == y;
}
int main(int argc, char *argv[])
{
assert(bad());
return 0;
}
_bad: ## @bad
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
subq $16, %rsp
## implicit-def: $xmm0
pinsrw $0, LCPI0_0(%rip), %xmm0
pextrw $0, %xmm0, -2(%rbp)
pextrw $0, %xmm0, -4(%rbp)
movzwl -2(%rbp), %edi
callq ___extendhfsf2
movss %xmm0, -8(%rbp) ## 4-byte Spill
movzwl -4(%rbp), %edi
callq ___extendhfsf2
movaps %xmm0, %xmm1
movss -8(%rbp), %xmm0 ## 4-byte Reload
## xmm0 = mem[0],zero,zero,zero
ucomiss %xmm1, %xmm0
sete %al
setnp %cl
andb %cl, %al
andb $1, %al
movzbl %al, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
; Function Attrs: noinline nounwind optnone ssp uwtable
define zeroext i1 @bad() #0 {
%1 = alloca half, align 2
%2 = alloca half, align 2
store half 0xH4980, ptr %1, align 2
store half 0xH4980, ptr %2, align 2
%3 = load half, ptr %1, align 2
%4 = fpext half %3 to float
%5 = load half, ptr %2, align 2
%6 = fpext half %5 to float
%7 = fcmp oeq float %4, %6
ret i1 %7
}
I thought it might have been the fpext that clang was doing, but that seems to not fix it. I applied this patch:
diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp
index 12949f0ab..ae403b29d 100644
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -2694,7 +2694,15 @@ gb_internal lbValue lb_emit_comp(lbProcedure *p, TokenKind op_kind, lbValue left
case Token_LtEq: pred = LLVMRealOLE; break;
case Token_NotEq: pred = LLVMRealONE; break;
}
- res.value = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+
+ if (are_types_identical(a, t_f16)) {
+ GB_ASSERT(are_types_identical(a, b));
+ LLVMValueRef lext = LLVMBuildFPExt(p->builder, left.value, LLVMFloatType(), "");
+ LLVMValueRef rext = LLVMBuildFPExt(p->builder, right.value, LLVMFloatType(), "");
+ res.value = LLVMBuildFCmp(p->builder, pred, lext, rext, "");
+ } else {
+ res.value = LLVMBuildFCmp(p->builder, pred, left.value, right.value, "");
+ }
} else if (is_type_typeid(a)) {
LLVMIntPredicate pred = {};
switch (op_kind) {
Which produces this llvm:
define internal i8 @main.bad() {
decls:
%y = alloca half, align 2
%x = alloca half, align 2
br label %entry
entry: ; preds = %decls
store half 0xH4980, ptr %y, align 2
store half 0xH4980, ptr %x, align 2
%0 = load half, ptr %x, align 2
%1 = load half, ptr %y, align 2
%2 = fpext half %0 to float
%3 = fpext half %1 to float
%4 = fcmp oeq float %2, %3
%5 = zext i1 %4 to i8
ret i8 %5
}
and this asm:
_main.bad:
.cfi_startproc
subq $24, %rsp
.cfi_def_cfa_offset 32
pinsrw $0, LCPI32_0(%rip), %xmm0
pextrw $0, %xmm0, 22(%rsp)
pextrw $0, %xmm0, 20(%rsp)
movzwl 20(%rsp), %edi
movzwl 22(%rsp), %eax
movl %eax, 12(%rsp)
callq ___extendhfsf2
movl 12(%rsp), %edi
movss %xmm0, 16(%rsp)
callq ___extendhfsf2
movaps %xmm0, %xmm1
movss 16(%rsp), %xmm0
ucomiss %xmm1, %xmm0
sete %al
setnp %cl
andb %cl, %al
andb $1, %al
addq $24, %rsp
retq
.cfi_endproc
Setting __ODIN_LLVM_F16_SUPPORTED
to false
instead of true
(what it currently gets set to) seems to "fix" it.
Looks like that changes the implementation of our builtin __extendhfsf2 to use the non-float registers for the calls and that was what llvm expects.
Expected is the half/f16 as an argument through edi
and returning the float/f32 through xmm0
@gingerBill do you think we should set that to false for darwin_amd64
? Or do you think there might be an underlying other issue here?
Interesting llvm issue regarding this: https://github.com/llvm/llvm-project/issues/56854
Context
In my effort to update the CI to LLVM 17 (in #3124) I came across these test failures regarding f16 truncations.
See the runner logs here: https://github.com/odin-lang/Odin/actions/runs/7996079532/job/21837831886?pr=3124