JeffFessler / mirt-demo

Demonstration notebooks for MIRT.jl
MIT License
12 stars 6 forks source link

mirt-demo on GPU #5

Open pkash16 opened 2 years ago

pkash16 commented 2 years ago

Not sure if this is an issue with my implementation, this demo, MIRT.jl, or LinearMapsAA.jl, so I'm putting this issue here. Let me know if there is a better place for this.

It seems like from this discussion https://github.com/JeffFessler/MIRT.jl/discussions/105 the goal is to have MIRT.jl "just work" with the GPU as everything is implemented with AbstractArray types. I am running this mirt-demo for "temporal finite differences" reconstruction of dynamic MRI data.

To this demo I added these lines:

kspace = CuArray(kspace)
smap = CuArray(smap) 
xtrue = CuArray(xtrue)

Constructing the A matrix raises no errors. But performing the multiplication on the next line, A * xtrue, produces the error message "This object is not a GPU array", which seems to stem from an unhappy multiplication in the LinearMapsAA package. The error message is hard for me to parse, so if you happen to be familiar with this and have any hints, I am happy to dig deeper and figure out what is going on, since I have a GPU set up on my end but am currently stumped parsing this error message.

ERROR: This object is not a GPU array
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:33
  [2] backend(#unused#::Type)
    @ GPUArrays ~/.julia/packages/GPUArrays/Zecv7/src/device/execution.jl:15
  [3] backend(x::Vector{ComplexF32})
    @ GPUArrays ~/.julia/packages/GPUArrays/Zecv7/src/device/execution.jl:16
  [4] _copyto!
    @ ~/.julia/packages/GPUArrays/Zecv7/src/host/broadcast.jl:73 [inlined]
  [5] materialize!
    @ ~/.julia/packages/GPUArrays/Zecv7/src/host/broadcast.jl:51 [inlined]
  [6] materialize!
    @ ./broadcast.jl:891 [inlined]
  [7] mul!
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/diagonal.jl:316 [inlined]
  [8] mul!
    @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/LinearAlgebra/src/matmul.jl:275 [inlined]
  [9] _unsafe_mul!(y::Vector{ComplexF32}, A::Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer})
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:246
 [10] _unsafe_mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/wrappedmap.jl:70 [inlined]
 [11] _compositemul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, source::Vector{ComplexF32}, dest::Nothing) (repeats 2 times)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/composition.jl:185
 [12] _unsafe_mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer})
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/composition.jl:168
 [13] mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer})
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:163
 [14] _generic_mapvec_mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:204
 [15] _unsafe_mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:249 [inlined]
 [16] ___blockmul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64, z::Vector{ComplexF32})
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:325
 [17] __blockmul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:297 [inlined]
 [18] _blockmul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{UnitRange{Int64}, Int64}, true}, A::LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:292
 [19] _unsafe_mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:377 [inlined]
 [20] _blockscaling!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, A::LinearMaps.BlockDiagonalMap{ComplexF32, NTuple{8, LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:498
 [21] _unsafe_mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/blockmap.jl:489 [inlined]
 [22] mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, A::LinearMaps.BlockDiagonalMap{ComplexF32, NTuple{8, LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}}}, x::CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:198
 [23] lm_mul!
    @ ~/.julia/packages/LinearMapsAA/iTSed/src/multiply.jl:88 [inlined]
 [24] lmao_mul!(Y::Array{ComplexF32, 3}, Lm::LinearMaps.BlockDiagonalMap{ComplexF32, NTuple{8, LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}}}, X::CuArray{ComplexF32, 3, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64; idim::Tuple{Int64, Int64, Int64}, odim::Tuple{Int64, Int64, Int64})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/iTSed/src/multiply.jl:153
 [25] lmax_mul(A::LinearMapAO{ComplexF32, 3, 3, LinearMaps.BlockDiagonalMap{ComplexF32, NTuple{8, LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}}}, NamedTuple{(:nblock,), Tuple{Int64}}}, X::CuArray{ComplexF32, 3, CUDA.Mem.DeviceBuffer})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/iTSed/src/multiply.jl:182
 [26] *(A::LinearMapAO{ComplexF32, 3, 3, LinearMaps.BlockDiagonalMap{ComplexF32, NTuple{8, LinearMaps.BlockMap{ComplexF32, Tuple{LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}, LinearMaps.CompositeMap{ComplexF32, Tuple{LinearMaps.WrappedMap{ComplexF32, Diagonal{ComplexF32, CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}}}, LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#587#593"{UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#588#594"{UnionAll}}}}}}, Tuple{Int64, Int64}}}}, NamedTuple{(:nblock,), Tuple{Int64}}}, X::CuArray{ComplexF32, 3, CUDA.Mem.DeviceBuffer})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/iTSed/src/multiply.jl:161
 [27] top-level scope
    @ REPL[51]:1

p.s. we have a similar reconstruction at USC for dynamic speech using acquired spiral sampled data in MATLAB + GPU, and I am very curious to see how Julia + GPU performs hence the interest in this demo!

JeffFessler commented 2 years ago

Funny timing because just yesterday I fixed an issue with LinearMapsAA.jl to enable CUDA there: https://github.com/JeffFessler/LinearMapsAA.jl/issues/43 Now I probably need to update the Manifest in this repo to use the latest version of that package.

There are multiple demos in this repo, so please let me know which one you were trying, and I'll look into it. In the short run, you could try removing the Manifest.toml, then doing ]update to get the latest of everything. But probably I will need to fix a few things to get this to work. I am committed to making all this work with CUDA, but I am traveling for 2 weeks so it might take a bit of time...

pkash16 commented 2 years ago

I was looking at this demo in particular: mri-sim-2d+t.jl. A few modifications were made to make things up-to-date with deprecations to MIRT (moving to MIRTjim).

Updating does not seem to fix this issue, but I think the complexity of the demo makes it hard to debug in itself. I created a smaller example that throws a different error when using GPU, which could be useful to track down bugs when you get the time.

using MIRT:nufft_init, ir_mri_kspace_ga_radial
using ImagePhantoms:shepp_logan
using CUDA
using LinearMapsAA

N = (256, 256)
image = shepp_logan(N...)
Nro = 256
Nspoke = 256

# CPU Test
kspace = ir_mri_kspace_ga_radial(Nro = Nro, Nspoke = Nspoke)
p = nufft_init([kspace[:,:,1][:] kspace[:,:,2][:]], N)
p.A * image # works

# GPU Test
image = CuArray(image)
kspace = CuArray(kspace)
p = nufft_init([kspace[:,:,1][:] kspace[:,:,2][:]], N)
p.A * image # error 
ERROR: GPU compilation of kernel #broadcast_kernel#17(CUDA.CuKernelContext, CuDeviceVector{ComplexF32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{ComplexF32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}}}}, Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{ComplexF32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{CuDeviceVector{ComplexF32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}}} which is not isbits.
    .2 is of type Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}} which is not isbits.
      .x is of type Vector{ComplexF32} which is not isbits.

Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/XyxTy/src/validation.jl:86
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/XyxTy/src/driver.jl:408 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/LDL7n/src/TimerOutput.jl:252 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/XyxTy/src/driver.jl:407 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/XyxTy/src/utils.jl:64
  [6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:354
  [7] #256
    @ ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:347 [inlined]
  [8] JuliaContext(f::CUDA.var"#256#257"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#17", Tuple{CUDA.CuKernelContext, CuDeviceVector{ComplexF32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{ComplexF32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/XyxTy/src/driver.jl:74
  [9] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:346
 [10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/XyxTy/src/cache.jl:90
 [11] cufunction(f::GPUArrays.var"#broadcast_kernel#17", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{ComplexF32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{ComplexF32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{ComplexF32}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:299
 [12] cufunction
    @ ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:293 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:102 [inlined]
 [14] #launch_heuristic#280
    @ ~/.julia/packages/CUDA/fAEDi/src/gpuarrays.jl:17 [inlined]
 [15] _copyto!
    @ ~/.julia/packages/GPUArrays/Zecv7/src/host/broadcast.jl:73 [inlined]
 [16] copyto!
    @ ~/.julia/packages/GPUArrays/Zecv7/src/host/broadcast.jl:56 [inlined]
 [17] copy
    @ ~/.julia/packages/GPUArrays/Zecv7/src/host/broadcast.jl:47 [inlined]
 [18] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(*), Tuple{CuArray{ComplexF32, 1, CUDA.Mem.DeviceBuffer}, Vector{ComplexF32}}})
    @ Base.Broadcast ./broadcast.jl:883
 [19] (::MIRT.var"#437#443"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll})(x::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ MIRT ~/.julia/packages/MIRT/I8xsd/src/nufft/nufft.jl:188
 [20] (::LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#437#443"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}})(x::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/6uvUQ/src/types.jl:141
 [21] _unsafe_mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, A::LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#437#443"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#438#444"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}}, x::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/functionmap.jl:114
 [22] mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:163 [inlined]
 [23] _generic_mapvec_mul!(y::SubArray{ComplexF32, 1, Matrix{ComplexF32}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, A::LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#437#443"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#438#444"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}}, x::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64)
    @ LinearMaps ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:204
 [24] _unsafe_mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:249 [inlined]
 [25] mul!
    @ ~/.julia/packages/LinearMaps/NF1pj/src/LinearMaps.jl:198 [inlined]
 [26] lm_mul!
    @ ~/.julia/packages/LinearMapsAA/6uvUQ/src/multiply.jl:88 [inlined]
 [27] lmao_mul!(Y::Vector{ComplexF32}, Lm::LinearMaps.FunctionMap{ComplexF32, LinearMapsAA.var"#6#10"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#437#443"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}, LinearMapsAA.var"#8#12"{Tuple{Int64, Int64}, Tuple{Int64}, MIRT.var"#438#444"{Vector{ComplexF32}, NFFT.NFFTPlan{2, 0, Float32}, UnionAll}}}, X::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, α::Int64, β::Int64; idim::Tuple{Int64, Int64}, odim::Tuple{Int64})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/6uvUQ/src/multiply.jl:153
 [28] lmax_mul(A::LinearMapAO{ComplexF32, 1, 2}, X::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/6uvUQ/src/multiply.jl:182
 [29] *(A::LinearMapAO{ComplexF32, 1, 2}, X::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
    @ LinearMapsAA ~/.julia/packages/LinearMapsAA/6uvUQ/src/multiply.jl:161
 [30] top-level scope
    @ ~/development/mirt-gpu-tests/test.jl:20
JeffFessler commented 2 years ago

@pkash16 FYI I am working on this... Currently I am waiting for an update to NFFT.jl for CUDA. You are on the bleeding edge 😄