ytdHuang opened this issue 2 days ago

As @maleadt mentioned in https://github.com/JuliaGPU/Metal.jl/issues/422, I'm re-opening the issue here.

The current `LinearAlgebra.kron` only supports `CuArray`; the other `AbstractGPUArray` types fall back to scalar indexing. Also, the methods for the Kronecker product of `Transpose{T,<:AbstractGPUArray}` or `Adjoint{T,<:AbstractGPUArray}` are missing.
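For the missing `Transpose`/`Adjoint` methods, one conceivable stopgap (a hypothetical sketch, not code that exists in GPUArrays.jl today) is to materialize the lazy wrapper and reuse the dense method; this assumes `copy` on the wrapper hits GPUArrays' on-device `transpose!`/`adjoint!` kernels rather than scalar indexing:

```julia
using LinearAlgebra
using GPUArrays

# Hypothetical helper alias for lazy row/column wrappers around a GPU matrix.
const TransOrAdj{T} = Union{Transpose{T,<:AbstractGPUMatrix},
                            Adjoint{T,<:AbstractGPUMatrix}}

# Materialize the wrapper with copy() (runs on the device), then fall
# back to the dense GPU kron method.
LinearAlgebra.kron(A::TransOrAdj, B::AbstractGPUMatrix) = kron(copy(A), B)
LinearAlgebra.kron(A::AbstractGPUMatrix, B::TransOrAdj) = kron(A, copy(B))
LinearAlgebra.kron(A::TransOrAdj, B::TransOrAdj)        = kron(copy(A), copy(B))
```

The extra copy is wasteful; dedicated kernels could index the wrappers directly, but this would at least avoid scalar indexing.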
ytdHuang commented:

@maleadt You mentioned just copying the implementation in CUDA.jl and pasting it here. Do you mean something like the following? I just changed `CuMatrix` to `AbstractGPUMatrix`, but what about the `kernel = @cuda` stuff? Sorry, I'm not so familiar with the development of GPU code...
```julia
function LinearAlgebra.kron!(C::AbstractGPUMatrix{TC}, A::AbstractGPUMatrix{TA}, B::AbstractGPUMatrix{TB}) where {TA,TB,TC}
    function _kron_mat_kernelA!(C, A, B, m, n, p, q)
        index_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        index_j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
        stride_i = blockDim().x * gridDim().x
        stride_j = blockDim().y * gridDim().y

        index_i > m && return
        index_j > n && return

        for i in index_i:stride_i:m
            for j in index_j:stride_j:n
                for k in 1:p
                    for l in 1:q
                        @inbounds C[(i-1)*p+k, (j-1)*q+l] = A[i, j] * B[k, l]
                    end
                end
            end
        end
        return nothing
    end

    function _kron_mat_kernelB!(C, A, B, m, n, p, q)
        index_p = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        index_q = (blockIdx().y - 1) * blockDim().y + threadIdx().y
        stride_p = blockDim().x * gridDim().x
        stride_q = blockDim().y * gridDim().y

        index_p > p && return
        index_q > q && return

        for i in 1:m
            for j in 1:n
                for k in index_p:stride_p:p
                    for l in index_q:stride_q:q
                        @inbounds C[(i-1)*p+k, (j-1)*q+l] = A[i, j] * B[k, l]
                    end
                end
            end
        end
        return nothing
    end

    m, n = size(A)
    p, q = size(B)

    # Use different kernels depending on the size of the matrices,
    # choosing to parallelize the matrix with the largest number of elements.
    m*n >= p*q ? (kernel = @cuda launch=false _kron_mat_kernelA!(C, A, B, m, n, p, q)) :
                 (kernel = @cuda launch=false _kron_mat_kernelB!(C, A, B, m, n, p, q))
    m*n >= p*q ? (sizes = (m, n)) : (sizes = (p, q))

    config = launch_configuration(kernel.fun)
    dim_ratio = sizes[1] / sizes[2]
    max_threads_i = max(1, floor(Int, sqrt(config.threads * dim_ratio)))
    max_threads_j = max(1, floor(Int, sqrt(config.threads / dim_ratio)))
    max_blocks_i  = max(1, floor(Int, sqrt(config.blocks * dim_ratio)))
    max_blocks_j  = max(1, floor(Int, sqrt(config.blocks / dim_ratio)))

    threads_i = min(sizes[1], max_threads_i)
    threads_j = min(sizes[2], max_threads_j)
    threads   = (threads_i, threads_j)
    blocks_i  = min(cld(sizes[1], threads_i), max_blocks_i)
    blocks_j  = min(cld(sizes[2], threads_j), max_blocks_j)
    blocks    = (blocks_i, blocks_j)

    kernel(C, A, B, m, n, p, q; threads=threads, blocks=blocks)
    return C
end

function LinearAlgebra.kron(A::AbstractGPUMatrix{TA}, B::AbstractGPUMatrix{TB}) where {TA,TB}
    m, n = size(A)
    p, q = size(B)
    T = promote_type(TA, TB)
    C = similar(A, T, m*p, n*q)
    kron!(C, A, B)
end
```
maleadt commented:

Ah, I glanced over the kernels. Yeah, those will have to be ported to GPUArrays' DSL (`gpu_call`), or wait for the impending switch over to KernelAbstractions.jl.
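For concreteness, here is roughly what a `gpu_call` port could look like. This is an untested sketch from memory of the `gpu_call`/`@linearidx` API, not the eventual implementation: it parallelizes over the elements of `C` with a single kernel instead of reproducing CUDA.jl's two-kernel heuristic, and it lets `gpu_call` pick the launch configuration, so all of the `launch_configuration`/`threads`/`blocks` bookkeeping above goes away:

```julia
using GPUArrays, LinearAlgebra

function LinearAlgebra.kron!(C::AbstractGPUMatrix, A::AbstractGPUMatrix, B::AbstractGPUMatrix)
    p, q = size(B)
    gpu_call(C, A, B, p, q; elements=length(C)) do ctx, C, A, B, p, q
        idx = @linearidx C                      # bounds-checked linear index into C
        r, c = Tuple(CartesianIndices(C)[idx])  # row/column in C
        i, k = divrem(r - 1, p)                 # 0-based row in A / row in B
        j, l = divrem(c - 1, q)                 # 0-based column in A / column in B
        @inbounds C[r, c] = A[i+1, j+1] * B[k+1, l+1]
        return
    end
    return C
end
```

A dedicated 2D launch, as in the CUDA.jl version, may well be faster; this is just the minimal port.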
ytdHuang commented:

Hmm, not sure how to contribute. I checked the documentation of GPUArrays.jl, but I don't think I understand, lol. I think you guys define `blockidx`, `blockdim`, and `threadidx` here, and the right one gets called depending on the backend during execution?
Okay, maybe I will just wait.
maleadt commented:

The docs are pretty bad; it's easier to look at some existing kernels. It's a very shallow abstraction over the CUDA stuff you're probably familiar with. But waiting for KA.jl is probably fine as well, if you don't need this urgently.
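To illustrate how shallow the abstraction is, here is a toy kernel in the style of GPUArrays.jl's existing ones (a sketch from memory; the index helpers are internals, so the names and import below are assumptions). The kernel receives a `ctx` object, and `blockidx(ctx)`, `blockdim(ctx)`, and `threadidx(ctx)` play exactly the role of CUDA's `blockIdx().x`, `blockDim().x`, and `threadIdx().x`; each back-end supplies its own implementation of them, which is how one kernel runs everywhere:

```julia
using GPUArrays
using GPUArrays: blockidx, blockdim, threadidx  # device-side helpers (unexported; from memory)

# dest .+= src, written against GPUArrays' gpu_call DSL.
function gpu_add!(dest::AbstractGPUArray, src::AbstractGPUArray)
    gpu_call(dest, src; elements=length(dest)) do ctx, dest, src
        # Global linear index: the same arithmetic as in a raw CUDA kernel.
        i = (blockidx(ctx) - 1) * blockdim(ctx) + threadidx(ctx)
        i > length(dest) && return
        @inbounds dest[i] += src[i]
        return
    end
    return dest
end
```

The exported `linear_index(ctx)` and `@linearidx` helpers wrap essentially this same index computation.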
ytdHuang commented:

So you guys are making GPUArrays.jl depend on KA.jl? May I ask approximately how long that would take? Just want to get the picture.
maleadt commented:

Yes, see the open PRs here and on the various back-ends. Hard to tell how long it would take. Hopefully on the order of weeks, at most months.
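Once that switch lands, the kernel could be expressed backend-agnostically in KernelAbstractions.jl along these lines (a sketch against the current KA.jl API with `get_backend` and `ndrange`; `kron_kernel!` and `kron_ka!` are hypothetical names, not what GPUArrays.jl will necessarily ship):

```julia
using KernelAbstractions

# One work-item per element of C; the indices into A and B are recovered
# from the output position.
@kernel function kron_kernel!(C, @Const(A), @Const(B))
    r, c = @index(Global, NTuple)
    p, q = size(B)
    i, k = divrem(r - 1, p)   # 0-based row in A / row in B
    j, l = divrem(c - 1, q)   # 0-based column in A / column in B
    @inbounds C[r, c] = A[i+1, j+1] * B[k+1, l+1]
end

function kron_ka!(C, A, B)
    backend = get_backend(C)  # CUDABackend(), MetalBackend(), ...
    kron_kernel!(backend)(C, A, B; ndrange=size(C))
    KernelAbstractions.synchronize(backend)
    return C
end
```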