Status: Closed — vpuri3 closed this issue 1 month ago
This works (with `allow_fast_activation = false`):

```julia
device = Lux.gpu_device()
l = Dense(2, 2, tanh; allow_fast_activation = false)
p, st = Lux.setup(rng, l)
x = CUDA.ones(2, 5)
l(x, p |> device, st |> device)[1]
# Float32[-0.16319485 -0.16319485 … -0.16319485 -0.16319485; -0.7446033 -0.7446033 … -0.7446033 -0.7446033]
```
This errors (default `allow_fast_activation = true`, so `tanh` is replaced by `tanh_fast`):

```julia
device = Lux.gpu_device()
l = Dense(2, 2, tanh)
p, st = Lux.setup(rng, l)
x = CUDA.ones(2, 5)
l(x, p |> device, st |> device)[1]
# Error (see stack trace below)
```
ERROR: ArgumentError: Pass LowerSIMDLoop is not a module pass Stacktrace: [1] add!(pm::LLVM.NewPMModulePassManager, pb::LLVM.PassBuilder, pass::LLVM.Interop.LowerSIMDLoopPass) @ LLVM ~/.julia/packages/LLVM/ShACK/src/newpm/passes.jl:701 [2] add!(pm::LLVM.NewPMModulePassManager, pass::LLVM.Interop.LowerSIMDLoopPass) @ LLVM ~/.julia/packages/LLVM/ShACK/src/newpm/passes.jl:728 [3] buildNewPMPipeline!(mpm::LLVM.NewPMModulePassManager, job::GPUCompiler.CompilerJob, opt_level::Int64) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:37 [4] buildNewPMPipeline!(mpm::LLVM.NewPMModulePassManager, job::GPUCompiler.CompilerJob) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:34 [5] macro expansion @ ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:24 [inlined] [6] macro expansion @ ~/.julia/packages/LLVM/ShACK/src/base.jl:98 [inlined] [7] macro expansion @ ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:23 [inlined] [8] macro expansion @ ~/.julia/packages/LLVM/ShACK/src/base.jl:98 [inlined] [9] optimize_newpm!(job::GPUCompiler.CompilerJob, mod::LLVM.Module) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:22 [10] optimize!(job::GPUCompiler.CompilerJob, mod::LLVM.Module) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/optim.jl:5 [11] macro expansion @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:344 [inlined] [12] macro expansion @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined] [13] macro expansion @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:343 [inlined] [14] macro expansion @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined] [15] macro expansion @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:316 [inlined] [16] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/utils.jl:92 [17] emit_llvm @ 
~/.julia/packages/GPUCompiler/kqxyC/src/utils.jl:86 [inlined] [18] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:134 [19] codegen @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:115 [inlined] [20] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:111 [21] compile @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:103 [inlined] [22] #1116 @ ~/.julia/packages/CUDA/XUdwt/src/compiler/compilation.jl:247 [inlined] [23] JuliaContext(f::CUDA.var"#1116#1119"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{}) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:52 [24] JuliaContext(f::Function) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:42 [25] compile(job::GPUCompiler.CompilerJob) @ CUDA ~/.julia/packages/CUDA/XUdwt/src/compiler/compilation.jl:246 [26] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/execution.jl:128 [27] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function) @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/execution.jl:103 [28] macro expansion @ ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:367 [inlined] [29] macro expansion @ ./lock.jl:267 [inlined] [30] cufunction(f::GPUArrays.var"#35#37", 
tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{…}, Tuple{…}, typeof(tanh_fast), Tuple{…}}, Int64}}; kwargs::@Kwargs{}) @ CUDA ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:362 [34] launch_heuristic @ ~/.julia/packages/CUDA/XUdwt/src/gpuarrays.jl:15 [inlined] [35] _copyto! @ ~/.julia/packages/GPUArrays/OqrUV/src/host/broadcast.jl:78 [inlined] [36] materialize! @ ~/.julia/packages/GPUArrays/OqrUV/src/host/broadcast.jl:38 [inlined] [37] materialize! @ ./broadcast.jl:911 [inlined] [38] _cublaslt_matmul_fused!(transy::Bool, y::CuArray{…}, σ::typeof(tanh_fast), transw::Bool, w::CuArray{…}, transx::Bool, x::CuArray{…}, b::CuArray{…}, aux::Nothing) @ LuxLibCUDAExt ~/.julia/packages/LuxLib/VRICL/ext/LuxLibCUDAExt/cublaslt.jl:141 [39] _cublaslt_matmul_fused! @ ~/.julia/packages/LuxLib/VRICL/ext/LuxLibCUDAExt/cublaslt.jl:13 [inlined] [40] _cublaslt_matmul_fused! @ ~/.julia/packages/LuxLib/VRICL/ext/LuxLibCUDAExt/cublaslt.jl:10 [inlined] [41] __fused_dense_bias_activation_impl(act::typeof(tanh_fast), weight::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) @ LuxLibCUDAExt ~/.julia/packages/LuxLib/VRICL/ext/LuxLibCUDAExt/fused_dense.jl:15 [42] fused_dense_bias_activation @ ~/.julia/packages/LuxLib/VRICL/src/api/dense.jl:46 [inlined] [43] fused_dense_bias_activation @ ~/.julia/packages/LuxLib/VRICL/src/api/dense.jl:38 [inlined] [44] (::Dense{true, typeof(tanh_fast), typeof(glorot_uniform), typeof(zeros32)})(x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ps::@NamedTuple{weight::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, bias::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, st::@NamedTuple{}) @ Lux ~/.julia/packages/Lux/PsbZF/src/layers/basic.jl:218 [45] top-level scope @ REPL[13]:1 [46] top-level scope @ ~/.julia/packages/CUDA/XUdwt/src/initialization.jl:209 Some type information was truncated. 
Use `show(err)` to see complete types.
(test) pkg> st Status `~/.julia/dev/KolmogorovArnold.jl/test/Project.toml` [6e4b80f9] BenchmarkTools v1.5.0 [052768ef] CUDA v5.3.4 [b0b7db55] ComponentArrays v0.15.13 [b2108857] Lux v0.5.51 [d0bbae9a] LuxCUDA v0.3.2 [34f89e08] LuxDeviceUtils v0.1.20 [eb30cadb] MLDatasets v0.7.14 [f1d291b0] MLUtils v0.4.4 [3bd65402] Optimisers v0.3.3 [36348300] OptimizationOptimJL v0.3.1 [91a5bcdd] Plots v1.40.4 [e88e6eb3] Zygote v0.6.70 [02a925ec] cuDNN v1.3.1
My apologies — this was due to an issue with my environment: I was using LLVM v7 in place of LLVM v6. Closing, as it is not a bug in Lux/LuxLib.