ffevotte / StochasticArithmetic.jl

Stochastic Arithmetic to diagnose Floating-Point problems in Julia
Other
18 stars 6 forks source link

fyi: error-free transformations #3

Open JeffreySarnoff opened 5 years ago

JeffreySarnoff commented 5 years ago

I have spent a good amount of time on optimizing error-free transformations in Julia. q.v. https://github.com/JeffreySarnoff/ErrorfreeArithmetic.jl in particular

const FMAFloat = Union{Float64, Float32}

function two_sum(a::T, b::T) where {T<:FMAFloat}
    hi = a + b
    a1 = hi - b
    b1 = hi - a1
    lo = (a - a1) + (b - b1)
    return hi, lo
end

function two_diff(a::T, b::T) where {T<:FMAFloat}
    hi = a - b
    a1 = hi + b
    b1 = hi - a1
    lo = (a - a1) - (b + b1)
    return hi, lo
end

 function two_prod(a::T, b::T) where {T<:FMAFloat}
    p = a * b
    e = fma(a, b, -p)
    p, e
end

# note that `two_div` is not always error-free, it is always faithful
# abs(true - two_div) < 1ulp

function two_div(a::T, b::T) where {T<:FMAFloat}
     hi = a / b
     lo = fma(-hi, b, a)
     lo /= b
     return hi, lo
end
ffevotte commented 5 years ago

Thanks ! I should definitely use these if they exist elsewhere and have been optimized.

On a related note, what do you think of using double precision arithmetic to define specialized versions of EFTs for single precision floats? I tend to do things like this because I would think it is more efficient than using algorithms such as TwoSum or TwoProd, but I have never actually timed both versions to make sure of that.

JeffreySarnoff commented 5 years ago

The function definitions I listed above automatically specialize for Float64s and Float32s. Using Float64s to emulate error-free transformations of Float32s could work because the number of significand bits in a Float64 is more than twice the number of significand bits in a Float32.

function twosum(a::Float32, b::Float32)
      hilo = Float64(a) + Float64(b)
      hi = Float32(hilo)
      lo = Float32(hilo - hi)
      return hi, lo
end

function twodiff(a::Float32, b::Float32)
      hilo = Float64(a) - Float64(b)
      hi = Float32(hilo)
      lo = Float32(hilo - hi)
      return hi, lo
end

function twoprod(a::Float32, b::Float32)
      hilo = Float64(a) * Float64(b)
      hi = Float32(hilo)
      lo = Float32(hilo - hi)
      return hi, lo
end

function twodiv(a::Float32, b::Float32)
      hilo = Float64(a) / Float64(b)
      hi = Float32(hilo)
      lo = Float32(hilo - hi)
      return hi, lo
end

I benchmark these as 10%-12% faster.