JuliaLinearAlgebra / Octavian.jl

Multi-threaded BLAS-like library that provides pure Julia matrix multiplication
https://julialinearalgebra.github.io/Octavian.jl/stable/

don't multithread x86 #146

Closed · chriselrod closed this 2 years ago

chriselrod commented 2 years ago

The problem looks like it isn't specific to x86.

codecov[bot] commented 2 years ago

Codecov Report

Merging #146 (f14d386) into master (db713f3) will decrease coverage by 0.65%. The diff coverage is 90.66%.

@@            Coverage Diff             @@
##           master     #146      +/-   ##
==========================================
- Coverage   88.72%   88.06%   -0.66%     
==========================================
  Files          13       13              
  Lines         878      930      +52     
==========================================
+ Hits          779      819      +40     
- Misses         99      111      +12     
Impacted Files            Coverage Δ
src/matmul.jl             91.84% <40.00%> (-1.00%) ↓
src/forward_diff.jl       98.85% <98.46%> (+1.07%) ↑
src/init.jl               64.28% <0.00%> (-14.29%) ↓
src/global_constants.jl   61.22% <0.00%> (-8.17%) ↓

Legend: Δ = absolute <relative> (impact), ø = not affected, ? = missing data. Last update db713f3...f14d386.

chriselrod commented 2 years ago

ReadOnlyMemoryError()s definitely look real.

chriselrod commented 2 years ago

Starting julia with --check-bounds=yes, and running

# No-op stand-ins for LoopVectorization's `@turbo`/`@tturbo`, so the included code
# runs as ordinary (bounds-checked) Julia loops
macro turbo(ex); esc(ex); end
macro tturbo(ex); esc(ex); end
using ForwardDiff, LinearAlgebra, Static, Test, ArrayInterface
using Static: One, Zero
using ArrayInterface: indices

include("/home/chriselrod/.julia/dev/Octavian/src/forward_diff.jl")

@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractMatrix{TB}) where {TA,TB}
  # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
  M, KA = size(A)
  KB, N = size(B)
  @assert KA == KB "Size mismatch."
  if M === StaticInt(1)
    transpose(Vector{promote_type(TA,TB)}(undef, N)), (M, KA, N)
  else
    Matrix{promote_type(TA,TB)}(undef, M, N), (M, KA, N)
  end
end
@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractVector{TB}) where {TA,TB}
  # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
  M, KA = size(A)
  KB = length(B)
  @assert KA == KB "Size mismatch."
  Vector{promote_type(TA,TB)}(undef, M), (M, KA, One())
end

"""
    matmul(A, B)

Multiply matrices `A` and `B`.
"""
@inline function matmul(A::AbstractMatrix, B::AbstractVecOrMat, α, β)
  C, (M,K,N) = alloc_matmul_product(A, B)
  matmul!(C, A, B, α, β, nothing, (M,K,N), ArrayInterface.contiguous_axis(C))
  return C
end
@inline matmul(A::AbstractMatrix, B::AbstractVecOrMat) = matmul(A, B, One(), Zero())
@inline matmul(A::AbstractMatrix, B::AbstractVecOrMat, α) = matmul(A, B, α, Zero())

"""
    matmul!(C, A, B[, α, β, max_threads])

Calculates `C = α * A * B + β * C` in place, overwriting the contents of `C`.
It may use up to `max_threads` threads. It will not use threads when nested in other threaded code.
"""
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
    matmul!(C, A, B, One(), Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
    matmul!(C, A, B, α, Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
    matmul!(C, A, B, α, β, nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread)
    matmul!(C, A, B, α, β, nthread, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, ::Nothing, ::StaticInt{2})
  _matmul!(transpose(C), transpose(B), transpose(A), α, β, nthread, nothing)
  return C
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, (M,K,N)::Tuple{Vararg{Integer,3}}, ::StaticInt{2})
  _matmul!(transpose(C), transpose(B), transpose(A), α, β, nthread, (N,K,M))
  return C
end
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, MKN, ::StaticInt)
  _matmul!(C, A, B, α, β, nthread, MKN)
  return C
end

@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat)
  C, (M,K,N) = alloc_matmul_product(A, B)
  matmul_serial!(C, A, B, One(), Zero(), (M,K,N), ArrayInterface.contiguous_axis(C))
  return C
end
@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat, α)
  C, (M,K,N) = alloc_matmul_product(A, B)
  matmul_serial!(C, A, B, α, Zero(), (M,K,N), ArrayInterface.contiguous_axis(C))
  return C
end
@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat, α, β)
  C, (M,K,N) = alloc_matmul_product(A, B)
  matmul_serial!(C, A, B, α, β, (M,K,N), ArrayInterface.contiguous_axis(C))
  return C
end
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
    matmul_serial!(C, A, B, One(), Zero(), nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
    matmul_serial!(C, A, B, α, Zero(), nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
    matmul_serial!(C, A, B, α, β, nothing, ArrayInterface.contiguous_axis(C))
end
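# Same layout trick for the serial path: a `contiguous_axis(C) == 2` destination is
# handled by computing `C' = B' * A'`.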
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, ::Nothing, ::StaticInt{2})
  _matmul_serial!(transpose(C), transpose(B), transpose(A), α, β, nothing)
  return C
end
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, (M,K,N)::Tuple{Vararg{Integer,3}}, ::StaticInt{2})
  _matmul_serial!(transpose(C), transpose(B), transpose(A), α, β, (N,K,M))
  return C
end
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, MKN, ::StaticInt)
  _matmul_serial!(C, A, B, α, β, MKN)
  return C
end

# Replace the real `_matmul_serial!`/`_matmul!` kernels with plain `LinearAlgebra.mul!`
# calls, taking the usual kernels out of the picture
_matmul_serial!(C, A, B, α, β, MKN) = mul!(C, A, B, convert(Float64, α), convert(Float64, β))
_matmul!(C, A, B, α, β, nthread, MKN) = mul!(C, A, B, convert(Float64, α), convert(Float64, β))
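
# --- Illustrative check (added for exposition; not part of the original repro) ---
# With the `mul!` fallbacks above, the wrappers should agree with ordinary `*`,
# i.e. `matmul!(C, A, B, α, β)` computes `α*A*B + β*C`.
let A = rand(4, 3), B = rand(3, 2), C = rand(4, 2)
  @test matmul(A, B) ≈ A * B
  @test matmul_serial(A, B) ≈ A * B
  @test matmul!(copy(C), A, B, 2.0, 0.5) ≈ 2.0 * (A * B) + 0.5 * C
end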

const Octavian = Main
include("/home/chriselrod/.julia/dev/Octavian/test/forward_diff.jl")

shows we at least don't have bounds errors, but I'm not really sure how to interpret this ReadOnlyMemoryError():

Internal error: encountered unexpected error in runtime:
ReadOnlyMemoryError()
ERROR: Package Octavian errored during testing (exit code: 541541187)
Stacktrace:
 [1] pkgerror(msg::String)
   @ Pkg.Types C:\hostedtoolcache\windows\julia\nightly\x64\share\julia\stdlib\v1.9\Pkg\src\Types.jl:68
 [2] test(ctx::Pkg.Types.Context, pkgs::Vector{Pkg.Types.PackageSpec}; coverage::Bool, julia_args::Cmd, test_args::Cmd, test_fn::Nothing, force_latest_compatible_version::Bool, allow_earlier_backwards_compatible_versions::Bool, allow_reresolve::Bool)
   @ Pkg.Operations C:\hostedtoolcache\windows\julia\nightly\x64\share\julia\stdlib\v1.9\Pkg\src\Operations.jl:1808
 [3] test(ctx::Pkg.Types.Context, pkgs::Vector{Pkg.Types.PackageSpec}; coverage::Bool, test_fn::Nothing, julia_args::Vector{String}, test_args::Cmd, force_latest_compatible_version::Bool, allow_earlier_backwards_compatible_versions::Bool, allow_reresolve::Bool, kwargs::Base.Pairs{Symbol, IOContext{Base.PipeEndpoint}, Tuple{Symbol}, NamedTuple{(:io,), Tuple{IOContext{Base.PipeEndpoint}}}})
   @ Pkg.API C:\hostedtoolcache\windows\julia\nightly\x64\share\julia\stdlib\v1.9\Pkg\src\API.jl:431
 [4] test(pkgs::Vector{Pkg.Types.PackageSpec}; io::IOContext{Base.PipeEndpoint}, kwargs::Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:coverage, :julia_args, :force_latest_compatible_version), Tuple{Bool, Vector{String}, Bool}}})
   @ Pkg.API C:\hostedtoolcache\windows\julia\nightly\x64\share\julia\stdlib\v1.9\Pkg\src\API.jl:156
 [5] test(; name::Nothing, uuid::Nothing, version::Nothing, url::Nothing, rev::Nothing, path::Nothing, mode::Pkg.Types.PackageMode, subdir::Nothing, kwargs::Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:coverage, :julia_args, :force_latest_compatible_version), Tuple{Bool, Vector{String}, Bool}}})
   @ Pkg.API C:\hostedtoolcache\windows\julia\nightly\x64\share\julia\stdlib\v1.9\Pkg\src\API.jl:171
 [6] top-level scope
   @ none:1

Seems like it is an internal Julia error? The error occurs only on 32-bit Windows.

DilumAluthge commented 2 years ago

@chriselrod Could you post a brief description explaining why we need to disable multithreading on 32-bit?
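
For illustration only, a minimal sketch of what gating the threaded path on the process's pointer width could look like. It assumes Octavian's exported `matmul!` and `matmul_serial!`; the wrapper name is hypothetical and this is not the change made in this PR:

using Octavian: matmul!, matmul_serial!

# Hypothetical wrapper (illustrative only): use the serial kernel whenever
# Julia is running as a 32-bit process, e.g. 32-bit Windows or i686 Linux.
function matmul_32bit_safe!(C, A, B)
  if Sys.WORD_SIZE == 32
    matmul_serial!(C, A, B)   # single-threaded path
  else
    matmul!(C, A, B)          # possibly multi-threaded path
  end
end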