JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.21k stars 221 forks source link

WMMA kernel works with Julia 1.7.2 but fails with `illegal memory access` for Julia 1.8.0-beta1 #1431

Closed carstenbauer closed 2 years ago

carstenbauer commented 2 years ago

Using the current CUDA.jl master (146ad00c0) the following kernel works with Julia 1.7.2 but fails with Julia 1.8.0-beta1.

"""
    kernel_wmma_int8_lowlevel(a_dev, b_dev, c_dev, d_dev)

Device kernel performing a single WMMA multiply-accumulate through the low-level
`WMMA.llvm_wmma_*` wrappers.

Fragments are loaded from `a_dev`, `b_dev`, and `c_dev`, combined by the
`m16n16k16` `s8` MMA wrapper, and the resulting fragment is stored to `d_dev`.
Every load and the store are given the same stride argument (16).

Returns `nothing`.
"""
function kernel_wmma_int8_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Stride argument shared by all load/store wrappers below.
    stride = 16

    # Load the two input fragments and the accumulator fragment.
    frag_a = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_s8(pointer(a_dev), stride)
    frag_b = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_s8(pointer(b_dev), stride)
    frag_c = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_s32(pointer(c_dev), stride)

    # Matrix multiply-accumulate on the loaded fragments.
    frag_d = WMMA.llvm_wmma_mma_col_col_m16n16k16_s8(frag_a, frag_b, frag_c)

    # Write the result fragment out through the pointer of `d_dev`.
    WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_s32(pointer(d_dev), frag_d, stride)
    return nothing
end

"""
    call_kernel()

Allocate the device matrices for one 16×16×16 WMMA operation and launch
`kernel_wmma_int8_lowlevel` on them.

`a` (`m × k`) and `b` (`k × n`) are filled with random `Int8` values, `c` (`m × n`)
with random `Int32` values, and the output `d` (`m × n`) is zero-initialized `Int32`.
The launch is wrapped in `CUDA.@sync`.

Returns `nothing`.
"""
function call_kernel()
    # Problem dimensions, matching the m16n16k16 wrappers used by the kernel.
    m = n = k = 16

    # Element types: Int8 inputs, Int32 accumulator and output.
    dtype_a = dtype_b = Int8
    dtype_c = dtype_d = Int32

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # NOTE(review): WMMA operations are warp-wide and the CUDA.jl WMMA examples
    # launch with `threads=32`; this launch uses the default configuration.
    # Confirm whether that is intended for this reproducer.
    CUDA.@sync @cuda kernel_wmma_int8_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end

Error message for Julia 1.8.0-beta1:

julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)                                                                                                                                            
Stacktrace:                                                                                                                                                                                                                              
 [1] throw_api_error(res::CUDA.cudaError_enum)                                                                                                                                                                                           
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/error.jl:91                                                                                                                                     
 [2] isdone                                                                                                                                                                                                                              
   @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:109 [inlined]                                                                                                                              
 [3] nonblocking_synchronize                                                                                                                                                                                                             
   @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:139 [inlined]                                                                                                                              
 [4] nonblocking_synchronize                                                                                                                                                                                                             
   @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:325 [inlined]                                                                                                                             
 [5] device_synchronize()                                                                                                                                                                                                                
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:319                                                                                                                                  
 [6] top-level scope                                                                                                                                                                                                                     
   @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/initialization.jl:54                                                                                                                                         

caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)                                                                                                                                        
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)                                                                     
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/error.jl:91
  [2] isdone                                   
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:109 [inlined]
  [3] nonblocking_synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/stream.jl:139 [inlined]
  [4] nonblocking_synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:325 [inlined]
  [5] device_synchronize()   
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/context.jl:319     
  [6] CuModule(data::Vector{UInt8}, options::Dict{CUDA.CUjit_option_enum, Any})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/module.jl:41  
  [7] CuModule                                                                                                      
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/lib/cudadrv/module.jl:23 [inlined]
  [8] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:451
  [9] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:95        
 [10] cufunction(f::CUDA.var"#kernel#361", tt::Type{Tuple{CuDeviceMatrix{Int32, 1}, UInt32, UInt32}}; name::String, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:297
 [11] macro expansion                                                                                                                                                                                                                    
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/compiler/execution.jl:102 [inlined]
 [12] rand!(rng::CUDA.RNG, A::CuArray{Int32, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:60
 [13] rand!(A::CuArray{Int32, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:259
 [14] rand(T::Type, dim1::Int64, dims::Int64)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/random.jl:273
 [15] call_kernel()
    @ Main ./REPL[3]:9
 [16] top-level scope
    @ REPL[4]:1
 [17] top-level scope
    @ /scratch/pc2-mitarbeiter/bauerc/devel/PC2GPUBenchmarks.jl/dev/CUDA/src/initialization.jl:52

Note that this is not specific to this particular kernel. I've tested a similar kernel for Float16, which also works just fine under 1.7.2 but fails with 1.8.0-beta1.

This might also be the reason why we see illegal memory access errors in #1426 and #1425. (It's curious, though, that #1419 works...)

maleadt commented 2 years ago

I don't think this is limited to WMMA; I've seen plenty of illegal memory accesses on 1.8 that I haven't had the time to debug yet.

maleadt commented 2 years ago

Dup of https://github.com/JuliaGPU/GPUCompiler.jl/issues/309