Closed shipengcheng1230 closed 3 years ago
julia> using LinearAlgebra
"""
    AmulB!(y::AbstractVector, A::AbstractMatrix, x::AbstractVector) -> y

Compute the matrix–vector product `A * x` in place, storing the result in `y`.

Uses LoopVectorization's threaded `@avxt` macro. `indices((A, y), 1)` couples
the row axis of `A` with the axis of `y`, and `indices((A, x), (2, 1))`
couples the column axis of `A` with the axis of `x`, so the shared extents are
asserted once up front and the loops can run without per-access bounds checks.

Returns `y`, matching the `LinearAlgebra.mul!` convention.
"""
function AmulB!(y::AbstractVector, A::AbstractMatrix, x::AbstractVector)
    @avxt for m ∈ indices((A, y), 1)
        # Accumulate the dot product of row m of A with x in a type-stable local.
        ym = zero(eltype(y))
        for n ∈ indices((A, x), (2, 1))
            ym += A[m, n] * x[n]
        end
        y[m] = ym
    end
    return y
end
AmulB! (generic function with 1 method)
julia> cv = vec(c); bv = vec(b);
julia> @btime matmul!($c, $a, $b);
616.434 ns (0 allocations: 0 bytes)
julia> @btime mul!($c, $a, $b);
3.679 μs (0 allocations: 0 bytes)
julia> @btime mul!($cv, $a, $bv);
1.247 μs (0 allocations: 0 bytes)
julia> @btime AmulB!($cv, $a, $bv);
434.470 ns (0 allocations: 0 bytes)
`matmul!` is already faster than gemm-`mul!` and gemv-`mul!` for me at this size. Maybe I could just add something like the `AmulB!` above as an optimized implementation. However, increasing `n` to 1000 makes this `AmulB!` slower than `matmul!`, which in turn becomes slower than both `mul!` variants:
julia> n = 1000;
julia> a = rand(n, n);
julia> b = rand(n, 1); bv = vec(b);
julia> c = zeros(n, 1); cv = vec(c);
julia> @btime matmul!($c, $a, $b);
150.031 μs (0 allocations: 0 bytes)
julia> @btime mul!($c, $a, $b);
115.170 μs (0 allocations: 0 bytes)
julia> @btime mul!($cv, $a, $bv);
82.197 μs (0 allocations: 0 bytes)
julia> @btime AmulB!($cv, $a, $bv);
219.726 μs (0 allocations: 0 bytes)
With the recent update of LoopVectorization, I found that yours is better than `gemv!`:
julia> using BenchmarkTools, LinearAlgebra, LoopVectorization
julia> BLAS.set_num_threads(Threads.nthreads()) # 24 cores
"""
    AmulB!(y::AbstractVector, A::AbstractMatrix, x::AbstractVector,
           α::Number, β::Number) -> y

Compute `y = α * (A * x) + β * y` in place — the 5-argument
`LinearAlgebra.mul!` contract — using LoopVectorization's threaded
`@tturbo` macro.

The scalars `α` and `β` accept any `Number` rather than being pinned to the
array eltype, so mixed-precision calls such as `AmulB!(c, a, b, 1, 1.0)`
dispatch correctly.

Returns `y`, matching the `mul!` convention.
"""
function AmulB!(y::AbstractVector, A::AbstractMatrix, x::AbstractVector,
                α::Number, β::Number)
    @tturbo for i ∈ indices((A, y), 1)
        # Row-i dot product accumulated in a type-stable local before scaling.
        yi = zero(eltype(y))
        for j ∈ indices((A, x), (2, 1))
            yi += A[i, j] * x[j]
        end
        y[i] = α * yi + β * y[i]
    end
    return y
end
AmulB! (generic function with 1 method)
julia> N = 1000; a = rand(N, N); b = rand(N); c = similar(b);
julia> @btime mul!(c, a, b, true, true);
50.226 μs (0 allocations: 0 bytes)
julia> @btime AmulB!(c, a, b, 1.0, 1.0);
7.710 μs (0 allocations: 0 bytes) # Huge difference!
julia> N = 10000; a = rand(N, N); b = rand(N); c = similar(b);
julia> @btime mul!(c, a, b, true, true);
12.050 ms (0 allocations: 0 bytes)
julia> @btime AmulB!(c, a, b, 1.0, 1.0);
10.024 ms (0 allocations: 0 bytes) # better!
julia> N = 20000; a = rand(N, N); b = rand(N); c = similar(b);
julia> @btime mul!(c, a, b, true, true);
46.879 ms (0 allocations: 0 bytes)
julia> @btime AmulB!(c, a, b, 1.0, 1.0);
44.089 ms (0 allocations: 0 bytes) # better!
This is not a comprehensive benchmark, but for the sizes in my use case your implementation is better!
Wonderful, thanks!
Hi, I am wondering whether a matrix–vector product will be implemented; currently, using the matrix–matrix product for this is slow:
Thank you for such amazing package!