Inefficient code generation for Int128 highmul

Keno commented 4 years ago

Consider the following benchmark:

using BitIntegers
using BenchmarkTools

Base.widen(::Type{UInt128}) = UInt256

@inline function highmul(x::T, y::T) where {T<:Integer}
    z = widemul(x, y)
    unsafe_trunc(T, z >> 8sizeof(T))
end

narrow(::Type{UInt128}) = UInt64
hilo(c::Integer) = unsafe_trunc(narrow(typeof(c)), c >> 4sizeof(c)), rem(c, narrow(typeof(c)))
@inline function highmul2(x::T, y::T) where {T<:Integer}
    a, b = hilo(x)
    c, d = hilo(y)
    widemul(a, c) + highmul(b, c) + highmul(a, d)
end

const UINT128_CONST = UInt128(309484926810737527165550592)
function benchmark(highmul, a::Vector{UInt128})
    for i = 1:length(a)
        a[i] = highmul(a[i], UINT128_CONST)
    end
end

a = rand(UInt128, 10000)
@benchmark benchmark(highmul, a)
@benchmark benchmark(highmul2, a)

We get:

julia> @benchmark benchmark(highmul, a)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     54.031 μs (0.00% GC)
  median time:      58.583 μs (0.00% GC)
  mean time:        60.844 μs (0.00% GC)
  maximum time:     2.154 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark benchmark(highmul2, a)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     25.003 μs (0.00% GC)
  median time:      26.454 μs (0.00% GC)
  mean time:        27.424 μs (0.00% GC)
  maximum time:     275.960 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

i.e. the second function is 2x faster despite being equivalent. This is a missing InstCombine transformation in LLVM.

Keno commented 4 years ago

Note to self: A related issue is that TargetLowering::BuildUDIV doesn't support non-legal integer types.

oscardssmith commented 4 years ago

bump?

JuliaLang / julia

Inefficient code generation for Int128 highmul #33854