LuxDL / Lux.jl

Elegant & Performant Scientific Machine Learning in Julia
https://lux.csail.mit.edu/
MIT License

Using `swish` as `Conv` activation function errors on the GPU #662

Closed · Sleort closed this 1 month ago

Sleort commented 1 month ago

In short:

```julia
using Lux, LuxCUDA, Random
rng = Random.default_rng()
gpu = gpu_device()
x_cpu = rand(Float32, 5, 5, 1, 1)
x_gpu = gpu(x_cpu)

# No errors on the CPU for all NNlib activation functions:
for act ∈ [
    σ, hardσ, hardtanh, relu,
    leakyrelu, relu6, rrelu, elu, gelu, swish, hardswish, selu,
    celu, softplus, softsign, logσ, logcosh,
    mish, tanhshrink, softshrink, trelu, lisht,
    tanh_fast, sigmoid_fast
    ]
    model = Conv((3,3), 1 => 1, act)
    ps, st = Lux.setup(rng, model)
    model(x_cpu, ps, st)
end

# No errors on the GPU ...
for act ∈ [
    σ, hardσ, hardtanh, relu,
    leakyrelu, relu6, rrelu, elu, gelu, hardswish, selu,
    celu, softplus, softsign, logσ, logcosh,
    mish, tanhshrink, softshrink, trelu, lisht,
    tanh_fast, sigmoid_fast
    ]
    model = Conv((3,3), 1 => 1, act)
    ps, st = Lux.setup(rng, model) |> gpu
    model(x_gpu, ps, st)
end

# ... except for when using swish!
act = swish
model = Conv((3,3), 1 => 1, act)
ps, st = Lux.setup(rng, model) |> gpu
model(x_gpu, ps, st)
```

The `swish` example throws the following error:

```
ERROR: InvalidIRError: compiling MethodInstance for (::GPUArrays.var"#35#37")(::CUDA.CuKernelContext, ::CuDeviceArray{Float32, 4, 1}, ::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{…}, NTuple{…}, ComposedFunction{…}, Tuple{…}}, ::Int64) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to julia.new_gc_frame)
Reason: unsupported call to an unknown function (call to julia.push_gc_frame)
Reason: unsupported call to an unknown function (call to julia.pop_gc_frame)
Reason: unsupported call to an unknown function (call to julia.get_gc_frame_slot)
Reason: unsupported dynamic function invocation (call to var"#_#103"(kw::Base.Pairs{Symbol, V, Tuple{Vararg{Symbol, N}}, NamedTuple{names, T}} where {V, N, names, T<:Tuple{Vararg{Any, N}}}, c::ComposedFunction, x...) @ Base operators.jl:1041)
Stacktrace:
 [1] ComposedFunction
   @ ./operators.jl:1041
 [2] _broadcast_getindex_evalf
   @ ./broadcast.jl:709
 [3] _broadcast_getindex
   @ ./broadcast.jl:682
 [4] getindex
   @ ./broadcast.jl:636
 [5] #35
   @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:70
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/validation.jl:147
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:445 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/Lw5SP/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:444 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/utils.jl:92
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/kqxyC/src/utils.jl:86 [inlined]
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:134
  [8] codegen
    @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:115 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:111
 [10] compile
    @ ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:103 [inlined]
 [11] #1116
    @ ~/.julia/packages/CUDA/XUdwt/src/compiler/compilation.jl:247 [inlined]
 [12] JuliaContext(f::CUDA.var"#1116#1119"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:52
 [13] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/driver.jl:42
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/XUdwt/src/compiler/compilation.jl:246
 [15] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/execution.jl:128
 [16] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kqxyC/src/execution.jl:103
 [17] macro expansion
    @ ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:367 [inlined]
 [18] macro expansion
    @ ./lock.jl:267 [inlined]
 [19] cufunction(f::GPUArrays.var"#35#37", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceArray{…}, Base.Broadcast.Broadcasted{…}, Int64}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:362
 [20] cufunction
    @ ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:359 [inlined]
 [21] macro expansion
    @ ~/.julia/packages/CUDA/XUdwt/src/compiler/execution.jl:112 [inlined]
 [22] #launch_heuristic#1173
    @ ~/.julia/packages/CUDA/XUdwt/src/gpuarrays.jl:17 [inlined]
 [23] launch_heuristic
    @ ~/.julia/packages/CUDA/XUdwt/src/gpuarrays.jl:15 [inlined]
 [24] _copyto!
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:78 [inlined]
 [25] materialize!
    @ ~/.julia/packages/GPUArrays/OKkAu/src/host/broadcast.jl:38 [inlined]
 [26] materialize!
    @ ./broadcast.jl:911 [inlined]
 [27] __nonuniform_fast_broadcast!
    @ ~/.julia/packages/LuxLib/GNpB5/src/utils.jl:136 [inlined]
 [28] __apply_bias_activation!!
    @ ~/.julia/packages/LuxLib/GNpB5/src/utils.jl:100 [inlined]
 [29] __conv_bias_act_impl
    @ ~/.julia/packages/LuxLib/GNpB5/src/impl/fused_conv.jl:74 [inlined]
 [30] __conv_bias_act
    @ ~/.julia/packages/LuxLib/GNpB5/src/impl/fused_conv.jl:57 [inlined]
 [31] __fused_conv_bias_activation_impl
    @ ~/.julia/packages/LuxLib/GNpB5/src/impl/fused_conv.jl:107 [inlined]
 [32] _fused_conv_bias_activation_impl
    @ ~/.julia/packages/LuxLib/GNpB5/src/impl/fused_conv.jl:99 [inlined]
 [33] fused_conv_bias_activation
    @ ~/.julia/packages/LuxLib/GNpB5/src/api/conv.jl:49 [inlined]
 [34] fused_conv_bias_activation
    @ ~/.julia/packages/LuxLib/GNpB5/src/api/conv.jl:33 [inlined]
 [35] (::Conv{…})(x::CuArray{…}, ps::@NamedTuple{…}, st::@NamedTuple{})
    @ Lux ~/.julia/packages/Lux/PsbZF/src/layers/conv.jl:119
 [36] top-level scope
    @ REPL[14]:1
 [37] top-level scope
    @ ~/.julia/packages/CUDA/XUdwt/src/initialization.jl:209
Some type information was truncated. Use `show(err)` to see complete types.
```

I'm on Julia v1.10.3, using Lux v0.5.51 and LuxCUDA v0.3.2.
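
In the meantime, an untested workaround sketch (my own, not verified): give `Conv` its default identity activation, so LuxLib's fused conv-bias-activation path is skipped, and apply `swish` in a separate `WrappedFunction` layer instead:

```julia
using Lux, LuxCUDA, Random

rng = Random.default_rng()
gpu = gpu_device()

# Applying swish outside Conv should bypass the fused conv+activation kernel.
model = Chain(
    Conv((3, 3), 1 => 1),            # identity activation by default
    WrappedFunction(x -> swish.(x)), # out-of-place broadcast of swish
)
ps, st = Lux.setup(rng, model) |> gpu
x_gpu = gpu(rand(Float32, 5, 5, 1, 1))
y, _ = model(x_gpu, ps, st)
```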

avik-pal commented 1 month ago

Yeah, it is using `sigmoid_fast`, which causes in-place broadcasting to fail. I will patch this in LuxLib.
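
To spell that out: `swish` is defined in terms of `sigmoid_fast`, and the fused in-place path ends up broadcasting a runtime-built `ComposedFunction` (frames [1]–[5] in the trace above). A rough sketch of the failing shape, using a stand-in composition rather than the exact LuxLib code:

```julia
using CUDA, NNlib

y = CUDA.rand(Float32, 5, 5, 1, 1)

# The plain activation broadcasts fine in place on the GPU:
y .= swish.(y)

# A ComposedFunction built at runtime can hit the kwargs-handling method at
# operators.jl:1041 and trigger dynamic dispatch inside the kernel:
g = swish ∘ identity  # hypothetical stand-in for LuxLib's composed activation
y .= g.(y)            # an in-place broadcast of this shape is what fails above
```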