Alexhuszagh commented 3 years ago

Issue

Float::from_bits is implemented as mem::transmute, and therefore does exactly what we need in all cases.

Solution

We don't need an endian-dependent code-path: we can just use the mask 0xFFFFFFFF to ensure it works, which will grab the least-significant 32 bits, and then create a float directly from those bits.

pub unsafe fn to_float_v1(word: u64) -> f32 {
    if cfg!(target_endian = "big") {
        *(&word as *const _ as *const f32).add(1)
    } else {
        *(&word as *const _ as *const f32)
    }
}

pub fn to_float_v2(word: u64) -> f32 {
    f32::from_bits((word & 0xFFFFFFFF) as u32)
}

example::to_float_v1:
        movd    xmm0, edi
        ret

example::to_float_v2:
        movd    xmm0, edi
        ret

The full implementation therefore could be:

diff --git a/src/parse.rs b/src/parse.rs
index 5571da7..9c592d4 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -1,5 +1,3 @@
-use core::mem;
-
 use crate::binary::compute_float;
 use crate::float::Float;
 use crate::number::{parse_inf_nan, parse_number};
@@ -32,13 +30,5 @@ pub fn parse_float<F: Float>(s: &[u8]) -> Option<(F, usize)> {
     if num.negative {
         word |= 1_u64 << F::SIGN_INDEX;
     }
-    let value = unsafe {
-        if cfg!(target_endian = "big") && mem::size_of::<F>() == 4 {
-            *(&word as *const _ as *const F).add(1)
-        } else {
-            *(&word as *const _ as *const F)
-        }
-    };
-
-    Some((value, rest))
+    Some((F::from_u64_bits(word), rest))
 }
diff --git a/src/float.rs b/src/float.rs
index 39bec41..a976408 100644
--- a/src/float.rs
+++ b/src/float.rs
@@ -40,6 +40,7 @@ pub trait Float:
     const MAX_MANTISSA_FAST_PATH: u64 = 2_u64 << Self::MANTISSA_EXPLICIT_BITS;

     fn from_u64(v: u64) -> Self;
+    fn from_u64_bits(v: u64) -> Self;
     fn pow10_fast_path(exponent: usize) -> Self;
 }

@@ -67,6 +68,11 @@ impl Float for f32 {
         v as _
     }

+    #[inline]
+    fn from_u64_bits(v: u64) -> Self {
+        f32::from_bits((v & 0xFFFFFFFF) as u32)
+    }
+
     #[inline]
     fn pow10_fast_path(exponent: usize) -> Self {
         #[allow(clippy::use_self)]
@@ -101,6 +107,11 @@ impl Float for f64 {
         v as _
     }

+    #[inline]
+    fn from_u64_bits(v: u64) -> Self {
+        f64::from_bits(v)
+    }
+
     #[inline]
     fn pow10_fast_path(exponent: usize) -> Self {
         #[allow(clippy::use_self)]

This was tested on both little-endian and big-endian platforms using the following cross targets:

powerpc-unknown-linux-gnu
x86_64-unknown-linux-gnu

Alexhuszagh commented 3 years ago

Using the following command, we can see the assembly generated for a few architectures for both functions:

Command

cross rustc --target $target --release -- --emit asm

Functions

#[no_mangle]
pub unsafe fn to_float_v1(word: u64) -> f32 {
    if cfg!(target_endian = "big") {
        *(&word as *const _ as *const f32).add(1)
    } else {
        *(&word as *const _ as *const f32)
    }
}

#[no_mangle]
pub unsafe fn to_float_v2(word: u64) -> f32 {
    f32::from_bits((word & 0xFFFFFFFF) as u32)
}

Assembly

i686-unknown-linux-gnu

to_float_v1:
    .cfi_startproc
    flds    4(%esp)
    retl

.set to_float_v2, to_float_v1

x86_64-unknown-linux-gnu

to_float_v1:
    .cfi_startproc
    movd    %edi, %xmm0
    retq

.set to_float_v2, to_float_v1

powerpc-unknown-linux-gnu

to_float_v1:
.Lfunc_begin5:
    .cfi_startproc
    stwu 1, -16(1)
    .cfi_def_cfa_offset 16
    stw 4, 12(1)
    lfs 1, 12(1)
    addi 1, 1, 16
    blr

.set to_float_v2, to_float_v1

mips64-unknown-linux-gnuabi64

to_float_v1:
    .cfi_startproc
    .frame  $sp,0,$ra
    .mask   0x00000000,0
    .fmask  0x00000000,0
    .set    noreorder
    .set    nomacro
    .set    noat
    sll $1, $4, 0
    jr  $ra
    mtc1    $1, $f0
    .set    at
    .set    macro
    .set    reorder
    .end    to_float_v1

.set to_float_v2, to_float_v1

In short, the ASM is identical in every case, big and little-endian alike.

aldanor commented 3 years ago

Looks great, thanks for the detailed analysis.

Perhaps we should have at least one BE target to run the tests on CI?

Alexhuszagh commented 3 years ago

I can add in a few targets. lexical-core tests on a lot of targets, and some of the good smoke signals are MIPS and PowerPC (for 64-bit and 32-bit).

The sample targets I'd use are one of the following:

    - env: TARGET=mips-unknown-linux-gnu DISABLE_BENCHES=1 NO_FEATURES=1
    - env: TARGET=mips64-unknown-linux-gnuabi64 DISABLE_BENCHES=1 NO_FEATURES=1
    - env: TARGET=mips64el-unknown-linux-gnuabi64 DISABLE_BENCHES=1 NO_FEATURES=1
    - env: TARGET=mipsel-unknown-linux-gnu DISABLE_BENCHES=1 NO_FEATURES=1
    - env: TARGET=powerpc-unknown-linux-gnu DISABLE_BENCHES=1 NO_FEATURES=1
    - env: TARGET=powerpc64-unknown-linux-gnu DISABLE_TESTS=1 NO_FEATURES=1
    - env: TARGET=powerpc64le-unknown-linux-gnu DISABLE_BENCHES=1 NO_FEATURES=1

Where you can use either PowerPC (which is native big-endian, but also has little-endian mode in 64-bit), or MIPS (same thing, different architecture), which both have 32-bit and 64-bit.

lemire commented 3 years ago

I have a POWER9 system I can give you access to (ssh/linux) if that's ever useful.

Alexhuszagh commented 3 years ago

@lemire That might be useful to triage a few correctness issues, I've been running comprehensive tests on a lot of platforms. I think the issue is due to errors in soft-floats, and therefore an artifact and not a real issue, but I'd rather be cautious. I'll also run them using your C++ version just to make sure.

aldanor / fast-float-rust

Remove Unncessary Unsafe and Endian-Dependent Codepaths #19

Issue

Solution

Assembly