using LoopVectorization
function mul_trace(A::AbstractMatrix, B::AbstractMatrix)
sA, sB = size(A), size(B)
@assert (sA === sB) && (length(sA) === 2) && (first(sA) === last(sA))
result = zero(promote_type(eltype(A), eltype(B)))
n = first(sA)
@turbo for i in 1:n
for j in 1:n
result += A[i, j] * B[j, i]
end
end
return result
end
using Test
@testset "mul_trace" begin
rng = MersenneTwister(1234)
for size in 2:4, T1 in (Float32, Float64), T2 in (Float32, Float64)
A = rand(rng, T1, size, size)
B = rand(rng, T2, size, size)
@test mul_trace(A, B) ≈ tr(A * B)
end
end
Works perfectly fine locally on my machine (macos). I could not reproduce it locally, but sometimes it fails and sometimes it doesn't on our CI. Link. It makes me think that the issue is machine/OS related.
The
mul_trace
function gives uninformative error when called withFloat32
matrices.The function itself simply computes
tr(A * B)
:Works perfectly fine locally on my machine (macos). I could not reproduce it locally, but sometimes it fails and sometimes it doesn't on our CI. Link. It makes me think that the issue is machine/OS related.