dahong67 / GCPDecompositions.jl

Generalized CP Tensor Decompositions
MIT License
5 stars 3 forks source link

Support CUDA arrays #36

Open dahong67 opened 10 months ago

dahong67 commented 10 months ago

Idea is to add GPU support by making the gcp implementation sufficiently generic so that it can work with CuArrays. Can also add CuArray specific methods via package extensions.

Should probably wait until current work on #35 and perhaps #34 are done.

Tagging @alexmul1114 who will be working on this as part of his winter fellowship.

dahong67 commented 10 months ago

Quick test on caviness at UD (interactive session with 16G of memory and a GPU) of the Khatri-Rao product implementation from https://github.com/dahong67/GCPDecompositions.jl/issues/34#issue-1995001913

               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.9.4 (2023-11-14)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |

(gpu-cuda) pkg> st
Status `~/gpu-cuda/Project.toml`
⌃ [052768ef] CUDA v5.0.0
Info Packages marked with ⌃ have new versions available and may be upgradable.

julia> using CUDA

julia> CUDA.versioninfo()
CUDA runtime 12.2, artifact installation
CUDA driver 12.1
NVIDIA driver 530.30.2

CUDA libraries: 
- CUBLAS: 12.2.5
- CURAND: 10.3.3
- CUFFT: 11.0.8
- CUSOLVER: 11.5.2
- CUSPARSE: 12.1.2
- CUPTI: 20.0.0
- NVML: 12.0.0+530.30.2

Julia packages: 
- CUDA: 5.0.0
- CUDA_Driver_jll: 0.6.0+4
- CUDA_Runtime_jll: 0.9.2+4

Toolchain:
- Julia: 1.9.4
- LLVM: 14.0.6
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5
- Device capability support: sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86

1 device:
  0: Tesla P100-PCIE-12GB (sm_60, 11.906 GiB / 12.000 GiB available)

julia> function khatrirao(A::Vararg{T,N}) where {T<:AbstractMatrix,N}
           r = size(A[1],2)
           # @boundscheck all(==(r),size.(A,2)) || throw(DimensionMismatch())
           R = ntuple(Val(N)) do k
               dims = (ntuple(i->1,Val(N-k))..., :, ntuple(i->1,Val(k-1))..., r)
               return reshape(A[k],dims)
           end
           return reshape(broadcast(*, R...),:,r)
       end
khatrirao (generic function with 1 method)

julia> A = CUDA.randn.((100, 100, 100), 500);

julia> typeof(A)
Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}

julia> A_cpu = Array.(A);

julia> typeof(A_cpu)
Tuple{Matrix{Float32}, Matrix{Float32}, Matrix{Float32}}

julia> A_cpu_64 = convert.(Matrix{Float64}, A_cpu);

julia> typeof(A_cpu_64)
Tuple{Matrix{Float64}, Matrix{Float64}, Matrix{Float64}}

julia> typeof(khatrirao(A...))
CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}

julia> typeof(khatrirao(A_cpu...))
Matrix{Float32} (alias for Array{Float32, 2})

julia> typeof(khatrirao(A_cpu_64...))
Matrix{Float64} (alias for Array{Float64, 2})

julia> Array(khatrirao(A...)) == khatrirao(A_cpu...) ≈ khatrirao(A_cpu_64...)
true

julia> CUDA.@time khatrirao(A...);
  0.065930 seconds (111 CPU allocations: 6.359 KiB) (1 GPU allocation: 1.863 GiB, 6.48% memmgmt time)

julia> @time khatrirao(A_cpu...);
  1.039305 seconds (18 allocations: 1.863 GiB, 4.55% gc time)

julia> @time khatrirao(A_cpu_64...);
  1.450372 seconds (21 allocations: 3.725 GiB, 0.75% gc time)

Looks promising!

alexmul1114 commented 9 months ago

Using the simple MTTKRP implementation in master with the Khatri-Rao function, we can get a significant speedup using the GPU for a tensor of size (100, 200, 300) (for all three modes of MTTKRP):

julia> using BenchmarkTools

julia> using CUDA

julia> CUDA.versioninfo()
CUDA runtime 12.2, artifact installation
CUDA driver 12.1
NVIDIA driver 530.30.2

CUDA libraries: 
- CUBLAS: 12.2.5
- CURAND: 10.3.3
- CUFFT: 11.0.8
- CUSOLVER: 11.5.2
- CUSPARSE: 12.1.2
- CUPTI: 20.0.0
- NVML: 12.0.0+530.30.2

Julia packages: 
- CUDA: 5.0.0
- CUDA_Driver_jll: 0.6.0+4
- CUDA_Runtime_jll: 0.9.2+4

Toolchain:
- Julia: 1.9.4
- LLVM: 14.0.6
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5
- Device capability support: sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86

1 device:
  0: Tesla P100-PCIE-12GB (sm_60, 11.334 GiB / 12.000 GiB available)

julia> function khatrirao(A::Vararg{T,N}) where {T<:AbstractMatrix,N}
           r = size(A[1],2)
           # @boundscheck all(==(r),size.(A,2)) || throw(DimensionMismatch())
           R = ntuple(Val(N)) do k
               dims = (ntuple(i->1,Val(N-k))..., :, ntuple(i->1,Val(k-1))..., r)
               return reshape(A[k],dims)
           end
           return reshape(broadcast(*, R...),:,r)
       end
khatrirao (generic function with 1 method)

julia> function mttkrp(X, U, n)
           N, I, r = length(U), Tuple(size.(U, 1)), (only∘unique)(size.(U, 2))
           (N == ndims(X) && I == size(X)) || throw(DimensionMismatch("`X` and `U` do not have matching dimensions"))

           # Matricized tensor (in mode n)
           Xn = reshape(permutedims(X, [n; setdiff(1:N, n)]), size(X, n), :)

           # Khatri-Rao product (in mode n)
           Zn = similar(U[1], prod(I[setdiff(1:N, n)]), r)
           Zn = khatrirao(U[reverse(setdiff(1:N, n))]...)

           # MTTKRP (in mode n)
           return Xn * Zn
       end
mttkrp (generic function with 1 method)

julia> X_gpu = CUDA.randn(100,200,300);

julia> r = 10;

julia> U_gpu = CUDA.rand.(size(X_gpu), r);

julia> X = Array(X_gpu);

julia> U = [Array(U_gpu[i]) for i in 1:ndims(X)];

julia> isapprox(mttkrp(X, U, 1), Array(mttkrp(X_gpu, U_gpu, 1)), atol=0)
true

julia> @btime mttkrp(X, U, 1);
  141.101 ms (57 allocations: 27.47 MiB)

julia> @btime CUDA.@sync mttkrp(X_gpu, U_gpu, 1);
  1.619 ms (248 allocations: 12.53 KiB)

julia> isapprox(mttkrp(X, U, 2), Array(mttkrp(X_gpu, U_gpu, 2)), atol=0)
true

julia> @btime mttkrp(X, U, 2);
  148.449 ms (57 allocations: 25.19 MiB)

julia> @btime CUDA.@sync mttkrp(X_gpu, U_gpu, 2);
  1.450 ms (248 allocations: 12.53 KiB)

julia> isapprox(mttkrp(X, U, 3), Array(mttkrp(X_gpu, U_gpu, 3)), atol=0)
true

julia> @btime mttkrp(X, U, 3);
  196.522 ms (57 allocations: 24.43 MiB)

julia> @btime CUDA.@sync mttkrp(X_gpu, U_gpu, 3);
  2.109 ms (248 allocations: 12.53 KiB)
alexmul1114 commented 9 months ago

The new MTTKRP function from #35 can be made to work on the GPU with one modification: making a copy of the result of `selectdim` inside the loop in the `else` case. This is necessary because `selectdim` returns a view, which is not GPU-compatible:

"""
    mttkrp(X, U, n)

Compute the matricized-tensor times Khatri-Rao product (MTTKRP) of the
`N`-way tensor `X` with the factor matrices `U = (U[1], ..., U[N])` in
mode `n`, returning a matrix of size `size(X, n) × r` where `r` is the
shared number of columns of the factor matrices.

Follows section III-B of "Fast Alternating LS Algorithms for High Order
CANDECOMP/PARAFAC Tensor Factorizations" by Phan et al.: the product is
split into inner tensor-vector products (modes `n+1:N`) and outer
tensor-vector products (modes `1:n-1`). The general case is expressed as
a single fused broadcast + reduction (no per-column loop, no `selectdim`
views), so the same code runs on both `Array`s and `CuArray`s.

Throws `DimensionMismatch` if `X` and `U` do not have matching dimensions.
"""
function mttkrp(X, U, n)

    # Dimensions
    N, I, r = length(U), Tuple(size.(U, 1)), (only∘unique)(size.(U, 2))
    (N == ndims(X) && I == size(X)) || throw(DimensionMismatch("`X` and `U` do not have matching dimensions"))

    # Output has the same array type / eltype as the factor matrices,
    # so CuArray inputs yield a CuArray result.
    Rn = similar(U[n])

    # Special cases: n == 1 has no outer tensor-vector products,
    # n == N has no inner tensor-vector products.
    if n == 1
        # Just inner tensor-vector products
        kr_inner = khatrirao(U[reverse(2:N)]...)
        mul!(Rn, reshape(X, size(X, 1), :), kr_inner)
    elseif n == N
        # Just outer tensor-vector products
        kr_outer = khatrirao(U[reverse(1:N-1)]...)
        mul!(Rn, transpose(reshape(X, :, size(X, N))), kr_outer)
    else
        # General case: inner products first, then contract the outer modes.
        # (These sizes are only needed here, so compute them inside the branch.)
        Jn = prod(I[1:n])         # product of mode sizes 1:n
        Kn = prod(I[n+1:N])       # product of mode sizes n+1:N
        Jn_inner = prod(I[1:n-1]) # product of mode sizes 1:n-1

        kr_inner = khatrirao(U[reverse(n+1:N)]...)
        kr_outer = khatrirao(U[reverse(1:n-1)]...)

        # Inner products: (Jn_inner * I[n]) × r, reshaped to Jn_inner × I[n] × r
        inner = reshape(reshape(X, Jn, Kn) * kr_inner, Jn_inner, I[n], r)

        # Contract the leading (outer) dimension against the matching column
        # of kr_outer: Rn[i, j] = Σₖ inner[k, i, j] * kr_outer[k, j].
        # A fused broadcast + sum avoids the per-column loop and the
        # scalar-indexing / view copies that CuArrays cannot handle.
        Rn .= reshape(sum(inner .* reshape(kr_outer, Jn_inner, 1, r); dims=1), I[n], r)
    end
    return Rn
end

Rewriting the loop in the `else` case as a single multiplication (or a fused broadcast and reduction) would make the code fully compatible with CuArrays.