using BitIntegers
using BenchmarkTools
Base.widen(::Type{UInt128}) = UInt256
@inline function highmul(x::T, y::T) where {T<:Integer}
z = widemul(x, y)
unsafe_trunc(T, z >> 8sizeof(T))
end
narrow(::Type{UInt128}) = UInt64
hilo(c::Integer) = unsafe_trunc(narrow(typeof(c)), c >> 4sizeof(c)), rem(c, narrow(typeof(c)))
@inline function highmul2(x::T, y::T) where {T<:Integer}
a, b = hilo(x)
c, d = hilo(y)
widemul(a, c) + highmul(b, c) + highmul(a, d)
end
const UINT128_CONST = UInt128(309484926810737527165550592)
function benchmark(highmul, a::Vector{UInt128})
for i = 1:length(a)
a[i] = highmul(a[i], UINT128_CONST)
end
end
a = rand(UInt128, 10000)
@benchmark benchmark(highmul, a)
@benchmark benchmark(highmul2, a)
We get:
julia> @benchmark benchmark(highmul, a)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 54.031 μs (0.00% GC)
median time: 58.583 μs (0.00% GC)
mean time: 60.844 μs (0.00% GC)
maximum time: 2.154 ms (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark benchmark(highmul2, a)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 25.003 μs (0.00% GC)
median time: 26.454 μs (0.00% GC)
mean time: 27.424 μs (0.00% GC)
maximum time: 275.960 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
i.e. the second function is 2x faster despite being equivalent. This is a missing InstCombine transformation in LLVM.
Consider the following benchmark:
We get:
i.e. the second function is 2x faster despite being equivalent. This is a missing InstCombine transformation in LLVM.