JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.16k stars 206 forks source link

Segfault during multiGPU tests #2377

Status: Closed (maleadt closed this issue 1 month ago)

maleadt commented 1 month ago

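As seen on CI, the `core/pool` test crashed with a segmentation fault inside `cublasXtDestroy` (called from `xt_handle_dtor` while `retry_reclaim_slow` was emptying a cache during a `CuArray` allocation):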

[940392] signal (11.1): Segmentation fault
in expression starting at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/core/pool.jl:3
unknown function (ip: 0x7f178961bac5)
unknown function (ip: 0x7f17897b79bb)
unknown function (ip: 0x7f17835f57a4)
unknown function (ip: 0x7f1783637ae3)
unknown function (ip: 0x7f178329bc3c)
unknown function (ip: 0x7f178329d238)
unknown function (ip: 0x7f178329b2c0)
cublasDestroy_v2 at /root/.cache/julia-buildkite-plugin/depots/3cc01fab-3357-4a7a-9294-cde2d3115a97/artifacts/10e364e8eb619d3f49a273c168dc1b8451dcf718/lib/libcublas.so (unknown line)
cublasXtDestroy at /root/.cache/julia-buildkite-plugin/depots/3cc01fab-3357-4a7a-9294-cde2d3115a97/artifacts/10e364e8eb619d3f49a273c168dc1b8451dcf718/lib/libcublas.so (unknown line)
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/utils/call.jl:218 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cublas/libcublas.jl:5345 [inlined]
#1002 at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/utils/call.jl:35 [inlined]
retry_reclaim at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:434 [inlined]
check at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cublas/libcublas.jl:24 [inlined]
cublasXtDestroy at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/utils/call.jl:34 [inlined]
#1291 at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cublas/CUBLAS.jl:145
#context!#978 at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cudadrv/state.jl:168 [inlined]
context! at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cudadrv/state.jl:163 [inlined]
xt_handle_dtor at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/cublas/CUBLAS.jl:144
unknown function (ip: 0x7f169a8657cc)
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
empty! at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/utils/cache.jl:111
#6 at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/lib/utils/cache.jl:22
unknown function (ip: 0x7f169a8643a2)
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
retry_reclaim_slow at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:464
retry_reclaim at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:436 [inlined]
_pool_alloc at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:625 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:606 [inlined]
macro expansion at ./timing.jl:395 [inlined]
pool_alloc at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:605 [inlined]
CuArray at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/array.jl:74
CuArray at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/array.jl:128 [inlined]
CuArray at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/array.jl:146
unknown function (ip: 0x7f16ec162109)
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
do_call at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:126
eval_value at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:223
eval_stmt_value at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:174 [inlined]
eval_body at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:617
eval_body at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:544
jl_interpret_toplevel_thunk at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/interpreter.c:775
jl_toplevel_eval_flex at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/toplevel.c:934
jl_toplevel_eval_flex at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/toplevel.c:877
ijl_toplevel_eval_in at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/toplevel.c:985
eval at ./boot.jl:385 [inlined]
include_string at ./loading.jl:2076
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
_include at ./loading.jl:2136
include at ./client.jl:489 [inlined]
#11 at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/runtests.jl:87 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/setup.jl:63 [inlined]
macro expansion at /root/.cache/julia-buildkite-plugin/julia_installs/bin/linux/x64/1.10/julia-1.10-latest-linux-x86_64/share/julia/stdlib/v1.10/Test/src/Test.jl:1577 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/setup.jl:63 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/utilities.jl:35 [inlined]
macro expansion at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/src/memory.jl:810 [inlined]
top-level scope at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/setup.jl:62
jl_toplevel_eval_flex at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/toplevel.c:925
ijl_toplevel_eval_in at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/toplevel.c:985
eval at ./boot.jl:385 [inlined]
runtests at /var/lib/buildkite-agent/builds/gpuci-16/julialang/cuda-dot-jl/test/setup.jl:74
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
jl_f__call_latest at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/builtins.c:812
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
do_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/builtins.c:768
#invokelatest#2 at ./essentials.jl:892
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
do_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/builtins.c:768
invokelatest at ./essentials.jl:889
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
do_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/builtins.c:768
#110 at /root/.cache/julia-buildkite-plugin/julia_installs/bin/linux/x64/1.10/julia-1.10-latest-linux-x86_64/share/julia/stdlib/v1.10/Distributed/src/process_messages.jl:287
run_work_thunk at /root/.cache/julia-buildkite-plugin/julia_installs/bin/linux/x64/1.10/julia-1.10-latest-linux-x86_64/share/julia/stdlib/v1.10/Distributed/src/process_messages.jl:70
#109 at /root/.cache/julia-buildkite-plugin/julia_installs/bin/linux/x64/1.10/julia-1.10-latest-linux-x86_64/share/julia/stdlib/v1.10/Distributed/src/process_messages.jl:287
unknown function (ip: 0x7f179952e882)
_jl_invoke at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:2895 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/gf.c:3077
jl_apply at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
start_task at /cache/build/builder-amdci4-2/julialang/julia-release-1-dot-10/src/task.c:1238
Allocations: 897317867 (Pool: 896129678; Big: 1188189); GC: 543

core/pool                                  (3) |         failed at 2024-05-07T19:49:47.041