Open tgross35 opened 4 months ago
cc @ecnelises
@rustbot label +F-f16_and_f128 +A-ABI +T-libs +C-bug -needs-triage
This is being caused by a calling convention mismatch; the Rust version passes/returns the f128
s in GPRs, but the builtin expects them to be in vector registers. This is because the ABI of f128
on PowerPC depends on whether the vsx
target feature has been enabled (the Rust target has it disabled by default): GCC gets around this by not supporting _Float128
when the vsx
target feature is disabled.
That is interesting. Any idea what is best to do here?
This is the equivalent of #116344 but for PowerPC; LLVM will generate builtins calls using whatever target features are enabled for the current function. For another example (compiler explorer)
#![feature(f16, f128)]
#[inline(never)]
pub fn soft_float(x: f128, dest: &mut f16) {
*dest = x as f16;
}
#[inline(never)]
#[target_feature(enable = "sse2")]
pub unsafe fn hard_float(x: f128, dest: &mut f16) {
*dest = x as f16;
}
When the above code is compiled for the i586-unknown-linux-gnu
target, both functions will call the __trunctfhf2
builtin but the soft_float
function will expect the result to be returned in the ax
register and the hard_float
function will expect the result to be returned in the xmm0
register.
The possible solutions I can think of are:
compiler-builtins
was used instead. We'd also have to prohibit linking with other Rust or C code that was compiled with the target feature in a different state, as that code would want to use the builtins with a different ABI. This prohibition would be extremely difficult to achieve if the system libc uses the relevant builtins (and was built with different relevant target features), although this is unlikely to be an issue for f16
and f128
.compiler-builtins
and prohibit linking like in option 1.Option 4 is probably the best solution here. Option 2/3 will probably be required for things like #116558, but that's a separate (but closely related) issue.
For some reason, the
__addkf3
symbol linked by rustc says that0x00000000000000000000000000000000 + 0x00000000000000000000000000000001 = 0x00000000000000000000000000000000
, when the answer should be0x00000000000000000000000000000001
. Other symbols likely do the wrong thing here too.Example:
Our Rust code emits
__addkf3
which calls the hardware f128 addition routinexsaddup
. The part I cannot explain is that the C version also emits__addkf3
which also callsxsaddup
, but somehow produces a correct result. I checked with GDB to make sure they are both hittingxsaddup
, so can't really explain the discrepancy.Original notes at https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105657287, discussion on Zulip at https://rust-lang.zulipchat.com/#narrow/stream/122651-general/topic/f128.20system.20libraries.20noncompliant.20platforms/near/438486364.
Full code copied from https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2106197914:
Rust code
```rust #![feature(f128)] #[no_mangle] #[inline(never)] fn add_entry(a: f128, b: f128) -> f128 { a + b } fn main() { let a = f128::from_bits(0x0); let b = f128::from_bits(0x1); dbg!(a, b); let c = add_entry(a, b); dbg!(c); } ```C code
```c #includeAssembly generated from Rust (incorrect result)
```asm 0000000000010d00 <.add_entry>: 10d00: 7c 08 02 a6 mflr r0 10d04: f8 21 ff 91 stdu r1,-112(r1) 10d08: f8 01 00 80 std r0,128(r1) 10d0c: 4b ff c6 95 bl d3a0 <00000143.plt_call.__addkf3> 10d10: e8 41 00 28 ld r2,40(r1) 10d14: 38 21 00 70 addi r1,r1,112 10d18: e8 01 00 10 ld r0,16(r1) 10d1c: 7c 08 03 a6 mtlr r0 10d20: 4e 80 00 20 blr 000000000000d3a0 <00000143.plt_call.__addkf3>: d3a0: f8 41 00 28 std r2,40(r1) d3a4: 3d 62 ff ff addis r11,r2,-1 d3a8: e9 8b 7f 58 ld r12,32600(r11) d3ac: 7d 89 03 a6 mtctr r12 d3b0: e8 4b 7f 60 ld r2,32608(r11) d3b4: 4e 80 04 20 bctr 0000000000056790 <.__addkf3_resolve>: 56790: 81 2d 8f 9c lwz r9,-28772(r13) 56794: 75 29 00 40 andis. r9,r9,64 56798: 41 82 00 18 beq 567b0 <.__addkf3_resolve+0x20> 5679c: e8 62 80 10 ld r3,-32752(r2) 567a0: 4e 80 00 20 blr 567a4: 60 00 00 00 nop 567a8: 60 00 00 00 nop 567ac: 60 42 00 00 ori r2,r2,0 567b0: e8 62 80 18 ld r3,-32744(r2) 567b4: 4e 80 00 20 blr ... 567c4: 60 00 00 00 nop 567c8: 60 00 00 00 nop 567cc: 60 42 00 00 ori r2,r2,0 000000000005e6e0 <.__addkf3_hw>: 5e6e0: fc 42 18 08 xsaddqp v2,v2,v3 5e6e4: 4e 80 00 20 blr ... 5e6f4: 60 00 00 00 nop 5e6f8: 60 00 00 00 nop 5e6fc: 60 00 00 00 nop 0000000000057050 <.__addkf3_sw>: 57050: fb 41 ff d0 std r26,-48(r1) 57054: fb 61 ff d8 std r27,-40(r1) 57058: fb 81 ff e0 std r28,-32(r1) 5705c: fb a1 ff e8 std r29,-24(r1) 57060: fb c1 ff f0 std r30,-16(r1) 57064: fb e1 ff f8 std r31,-8(r1) 57068: f8 21 ff 41 stdu r1,-192(r1) 5706c: 39 21 00 70 addi r9,r1,112 57070: 39 41 00 70 addi r10,r1,112 // ... full sw implementation ```Assembly generated from C (correct result)
```asm 0000000010000af8 <.add_entry>: 10000af8: 7c 08 02 a6 mflr r0 10000afc: f8 01 00 10 std r0,16(r1) 10000b00: fb e1 ff f8 std r31,-8(r1) 10000b04: f8 21 ff 81 stdu r1,-128(r1) 10000b08: 7c 3f 0b 78 mr r31,r1 10000b0c: 39 20 00 30 li r9,48 10000b10: 39 5f 00 80 addi r10,r31,128 10000b14: 7c 4a 4f 99 stxvd2x vs34,r10,r9 10000b18: 39 20 00 40 li r9,64 10000b1c: 39 5f 00 80 addi r10,r31,128 10000b20: 7c 6a 4f 99 stxvd2x vs35,r10,r9 10000b24: 39 20 00 40 li r9,64 10000b28: 39 5f 00 80 addi r10,r31,128 10000b2c: 7c 6a 4e 99 lxvd2x vs35,r10,r9 10000b30: 39 20 00 30 li r9,48 10000b34: 39 5f 00 80 addi r10,r31,128 10000b38: 7c 4a 4e 99 lxvd2x vs34,r10,r9 10000b3c: 4b ff fb a5 bl 100006e0 <00000019.plt_call.__addkf3> 10000b40: e8 41 00 28 ld r2,40(r1) 10000b44: f0 02 14 96 xxmr vs0,vs34 10000b48: f0 40 04 91 xxmr vs34,vs0 10000b4c: 38 3f 00 80 addi r1,r31,128 10000b50: e8 01 00 10 ld r0,16(r1) 10000b54: 7c 08 03 a6 mtlr r0 10000b58: eb e1 ff f8 ld r31,-8(r1) 10000b5c: 4e 80 00 20 blr 10000b60: 00 00 00 00 .long 0x0 10000b64: 00 00 00 01 .long 0x1 10000b68: 80 01 00 01 lwz r0,1(r1) 0000000010000c40 <.__addkf3_resolve>: 10000c40: 81 2d 8f 9c lwz r9,-28772(r13) 10000c44: 75 29 00 40 andis. r9,r9,64 10000c48: 41 82 00 18 beq 10000c60 <.__addkf3_resolve+0x20> 10000c4c: e8 62 80 10 ld r3,-32752(r2) 10000c50: 4e 80 00 20 blr 10000c54: 60 00 00 00 nop 10000c58: 60 00 00 00 nop 10000c5c: 60 42 00 00 ori r2,r2,0 10000c60: e8 62 80 18 ld r3,-32744(r2) 10000c64: 4e 80 00 20 blr ... 10000c74: 60 00 00 00 nop 10000c78: 60 00 00 00 nop 10000c7c: 60 42 00 00 ori r2,r2,0 0000000010008b90 <.__addkf3_hw>: 10008b90: fc 42 18 08 xsaddqp v2,v2,v3 10008b94: 4e 80 00 20 blr ... 10008ba4: 60 00 00 00 nop 10008ba8: 60 00 00 00 nop 10008bac: 60 00 00 00 nop 0000000010001500 <.__addkf3_sw>: 10001500: fb 41 ff d0 std r26,-48(r1) 10001504: fb 61 ff d8 std r27,-40(r1) 10001508: fb 81 ff e0 std r28,-32(r1) 1000150c: fb a1 ff e8 std r29,-24(r1) 10001510: fb c1 ff f0 std r30,-16(r1) 10001514: fb e1 ff f8 std r31,-8(r1) 10001518: f8 21 ff 41 stdu r1,-192(r1) 1000151c: 39 21 00 70 addi r9,r1,112 10001520: 39 41 00 70 addi r10,r1,112 // ... ```