JuliaSIMD / LoopVectorization.jl

Macro(s) for vectorizing loops.
MIT License
742 stars 66 forks source link

Odd `UndefVarError` caused by `LoopVectorization.jl` #377

Open bvdmitri opened 2 years ago

bvdmitri commented 2 years ago

The mul_trace function gives an uninformative error when called with Float32 matrices.

mul_trace: Error During Test at /home/runner/work/ReactiveMP.jl/ReactiveMP.jl/test/algebra/test_helpers.jl:78
  Test threw exception
  Expression: ReactiveMP.mul_trace(A, B) ≈ tr(A * B)
  UndefVarError: ####op#279__0 not defined
  Stacktrace:
   [1] macro expansion
     @ ~/.julia/packages/LoopVectorization/ndGJi/src/reconstruct_loopset.jl:713 [inlined]
   [2] _turbo_!(::Val{(false, 0, 0, 0, false, 16, 64, 32, 64, 32768, 1048576, 37486592, 0x0000000000000001)}, ::Val{(:LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0001, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0002, 0x02), Symbol("##DROPPED#CONSTANT##"), Symbol("##DROPPED#CONSTANT##"), LoopVectorization.OperationStruct(0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0003, 0x00), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000100020003, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0003, 0x00))}, ::Val{(LoopVectorization.ArrayRefStruct{:A, Symbol("##vptr##_A")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000102, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101), LoopVectorization.ArrayRefStruct{:B, Symbol("##vptr##_B")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000201, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101))}, ::Val{(0, (4,), (3,), (), (), (), ())}, ::Val{(:i, :j)}, ::Val{Tuple{Tuple{CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}, CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}}, Tuple{LayoutPointers.GroupedStridedPointers{Tuple{Ptr{Float32}, Ptr{Float32}}, (1, 1), 
(0, 0), ((1, 2), (1, 2)), ((1, 2), (3, 4)), Tuple{Static.StaticInt{4}, Int64, Static.StaticInt{4}, Int64}, NTuple{4, Static.StaticInt{0}}}, DataType}}}, ::Int64, ::Int64, ::Ptr{Float32}, ::Ptr{Float32}, ::Int64, ::Int64, ::LoopVectorization.StaticType{Float32})
     @ LoopVectorization ~/.julia/packages/LoopVectorization/ndGJi/src/reconstruct_loopset.jl:713
   [3] mul_trace(A::Matrix{Float32}, B::Matrix{Float32})

The function itself simply computes tr(A * B):

using LoopVectorization

# Compute the trace of the matrix product `A * B` without forming the product,
# by accumulating `A[i, j] * B[j, i]` over every index pair.
#
# Arguments:
#   A, B -- matrices that must have identical sizes and be square.
# Returns:
#   the accumulated sum; the accumulator starts as a zero of the promoted
#   element type of `A` and `B`.
# Throws:
#   AssertionError when the size check at the top of the body fails.
function mul_trace(A::AbstractMatrix, B::AbstractMatrix)
    sA, sB = size(A), size(B)
    # Require equal size tuples, exactly two dimensions, and equal row/column counts.
    @assert (sA === sB) && (length(sA) === 2) && (first(sA) === last(sA))
    # Seed the accumulator with the promoted element type of both inputs.
    result = zero(promote_type(eltype(A), eltype(B)))
    n = first(sA)
    # The reduction loop nest is handed to LoopVectorization's `@turbo` macro;
    # both indices run over 1:n.
    @turbo for i in 1:n
        for j in 1:n
            # B is indexed with swapped indices, so each term is a summand of tr(A * B).
            result += A[i, j] * B[j, i]
        end
    end
    return result
end
using LinearAlgebra: tr
using Random: MersenneTwister
using Test

# Compare `mul_trace(A, B)` against the reference `tr(A * B)` for every combination
# of Float32/Float64 element types and for square sizes 2 through 4.
@testset "mul_trace" begin
    # Fixed seed so the generated matrices are reproducible between runs.
    rng = MersenneTwister(1234)

    # `dim` rather than `size`, so the loop variable does not shadow `Base.size`.
    for dim in 2:4, T1 in (Float32, Float64), T2 in (Float32, Float64)
        A = rand(rng, T1, dim, dim)
        B = rand(rng, T2, dim, dim)
        @test mul_trace(A, B) ≈ tr(A * B)
    end
end

It works perfectly fine locally on my machine (macOS), where I could not reproduce the error. On our CI, however, it fails intermittently: sometimes it fails and sometimes it doesn't. Link. It makes me think that the issue is machine/OS related.

chriselrod commented 2 years ago

It makes me think that the issue is machine/OS related.

I'm assuming it only shows up on systems with AVX512.