aldanor / fast-float-rust

Super-fast float parser in Rust (now part of Rust core)
https://docs.rs/fast-float
Apache License 2.0
275 stars 20 forks source link

Consider Adding Support for Disguised Fast-Path Cases #16

Closed Alexhuszagh closed 3 years ago

Alexhuszagh commented 3 years ago

Issue

fast-float-rust does not handle disguised fast path cases, such as "1.2345e30", and therefore falls back to compute_float when it's unnecessary. The changes are trivial, and lead to ~30% performance improvements for disguised fast-path cases without affecting other benchmarks. See rust-dec2flt for exact specifics on these benchmarks.

Benchmarks

These benchmarks were run on an i7-6560U CPU @ 2.20GHz, on a target of x86_64-unknown-linux-gnu, running kernel version 5.11.16-100, on a Rust version of rustc 1.53.0-nightly (132b4e5d1 2021-04-13). The performance CPU governor was used for all benchmarks, and were run consecutively on A/C power with only tmux and Sublime Text open for all benchmarks. The floats that were parsed are as follows:

// Example fast-path value.
const FAST: &str = "1.2345e22";
// Example disguised fast-path value.
const DISGUISED: &str = "1.2345e30";
// Example moderate path value: clearly not halfway `1 << 53`.
const MODERATE: &str = "9007199254740992.0";
// Example exactly-halfway value `(1<<53) + 1`.
const HALFWAY: &str = "9007199254740993.0";
// Example large, near-halfway value.
const LARGE: &str = "8.988465674311580536566680e307";
// Example denormal, near-halfway value.
const DENORMAL: &str = "8.442911973260991817129021e-309";

disguised

fast-float/fast         time:   [20.446 ns 20.530 ns 20.625 ns]
fast-float/disguised    time:   [20.771 ns 20.814 ns 20.870 ns]
fast-float/moderate     time:   [34.970 ns 35.268 ns 35.638 ns]
fast-float/halfway      time:   [34.346 ns 34.794 ns 35.337 ns]
fast-float/large        time:   [14.346 us 14.439 us 14.549 us]
fast-float/denormal     time:   [72.166 ns 72.652 ns 73.251 ns]

master

Note: I ran these benchmarks numerous times, to try to ensure fast-float/fast was at least as fast as those for the disguised case, but they never were. Since this introduced an additional branch when parsing, we should assume that the fast-path case is at least as fast here as on the disguised branch.

fast-float/fast         time:   [21.513 ns 21.614 ns 21.727 ns]
fast-float/disguised    time:   [30.283 ns 30.567 ns 30.903 ns]
fast-float/moderate     time:   [34.232 ns 34.408 ns 34.615 ns]
fast-float/halfway      time:   [34.598 ns 34.734 ns 34.897 ns]
fast-float/large        time:   [13.897 us 13.944 us 13.999 us]
fast-float/denormal     time:   [76.306 ns 76.799 ns 77.338 ns]

DIff

The changes are trivial, and do not affect binary size:

diff --git a/src/float.rs b/src/float.rs
index 39bec41..b31f1f2 100644
--- a/src/float.rs
+++ b/src/float.rs
@@ -5,6 +5,25 @@ mod private {
     pub trait Sealed {}
 }

+pub const INT_POW10: [u64; 16] = [
+    1,
+    10,
+    100,
+    1000,
+    10000,
+    100000,
+    1000000,
+    10000000,
+    100000000,
+    1000000000,
+    10000000000,
+    100000000000,
+    1000000000000,
+    10000000000000,
+    100000000000000,
+    1000000000000000,
+];
+
 #[doc(hidden)]
 pub trait Float:
     Sized
@@ -31,6 +50,7 @@ pub trait Float:
     const MAX_EXPONENT_ROUND_TO_EVEN: i32;
     const MIN_EXPONENT_FAST_PATH: i64;
     const MAX_EXPONENT_FAST_PATH: i64;
+    const MAX_DISGUISED_EXPONENT_FAST_PATH: i64;
     const MINIMUM_EXPONENT: i32;
     const INFINITE_POWER: i32;
     const SIGN_INDEX: usize;
@@ -56,6 +76,7 @@ impl Float for f32 {
     const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 10;
     const MIN_EXPONENT_FAST_PATH: i64 = -10; // assuming FLT_EVAL_METHOD = 0
     const MAX_EXPONENT_FAST_PATH: i64 = 10;
+    const MAX_DISGUISED_EXPONENT_FAST_PATH: i64 = 17;
     const MINIMUM_EXPONENT: i32 = -127;
     const INFINITE_POWER: i32 = 0xFF;
     const SIGN_INDEX: usize = 31;
@@ -90,6 +111,7 @@ impl Float for f64 {
     const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 23;
     const MIN_EXPONENT_FAST_PATH: i64 = -22; // assuming FLT_EVAL_METHOD = 0
     const MAX_EXPONENT_FAST_PATH: i64 = 22;
+    const MAX_DISGUISED_EXPONENT_FAST_PATH: i64 = 37;
     const MINIMUM_EXPONENT: i32 = -1023;
     const INFINITE_POWER: i32 = 0x7FF;
     const SIGN_INDEX: usize = 63;
diff --git a/src/number.rs b/src/number.rs
index ecb76c2..4c78c97 100644
--- a/src/number.rs
+++ b/src/number.rs
@@ -1,5 +1,5 @@
 use crate::common::{is_8digits_le, AsciiStr, ByteSlice};
-use crate::float::Float;
+use crate::float::{Float, INT_POW10};

 const MIN_19DIGIT_INT: u64 = 100_0000_0000_0000_0000;

@@ -15,14 +15,15 @@ impl Number {
     #[inline]
     fn is_fast_path<F: Float>(&self) -> bool {
         F::MIN_EXPONENT_FAST_PATH <= self.exponent
-            && self.exponent <= F::MAX_EXPONENT_FAST_PATH
+            && self.exponent <= F::MAX_DISGUISED_EXPONENT_FAST_PATH
             && self.mantissa <= F::MAX_MANTISSA_FAST_PATH
             && !self.many_digits
     }

     #[inline]
     pub fn try_fast_path<F: Float>(&self) -> Option<F> {
-        if self.is_fast_path::<F>() {
+        let is_fast = self.is_fast_path::<F>();
+        if is_fast && self.exponent <= F::MAX_EXPONENT_FAST_PATH {
             let mut value = F::from_u64(self.mantissa);
             if self.exponent < 0 {
                 value = value / F::pow10_fast_path((-self.exponent) as _);
@@ -33,6 +34,19 @@ impl Number {
                 value = -value;
             }
             Some(value)
+        } else if is_fast {
+            let max_exp = F::MAX_EXPONENT_FAST_PATH;
+            let shift = self.exponent - max_exp;
+            let mant = self.mantissa.checked_mul(INT_POW10[shift as usize])?;
+            if mant > F::MAX_MANTISSA_FAST_PATH {
+                return None;
+            }
+            let mut value = F::from_u64(mant);
+            value = value * F::pow10_fast_path(max_exp as _);
+            if self.negative {
+                value = -value;
+            }
+            Some(value)
         } else {
             None
         }