gfx-rs / wgpu

A cross-platform, safe, pure-Rust graphics API.
https://wgpu.rs
Apache License 2.0
12.62k stars 924 forks source link

Intermittent GL/AMD driver segfault at the end of life_cycle::buffer_destroy test on linux #4983

Open nical opened 10 months ago

nical commented 10 months ago

Description

I don't expect this to be specific to the buffer_destroy test but that's the one I've been using to reproduce and investigate the issue.

At the end of the test, always after having dropped the devices (and the AdapterShared), I sometimes get a segfault in one of the driver's threads.

Backtrace of the crash (note the glow debug message callback part and other indications that we are running GL-related code):

#0  0x00000000000001e0 in ?? ()
#1  0x0000555556531744 in alloc::boxed::{impl#48}::call_mut<(u32, u32, u32, u32, &str), dyn core::ops::function::FnMut<(u32, u32, u32, u32, &str), Output=()>, alloc::alloc::Global> (self=0x555556bedbe0, args=...)
    at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/alloc/src/boxed.rs:1992
#2  0x000055555652af1f in glow::native::raw_debug_message_callback::{closure#0} () at src/native.rs:3215
#3  0x00005555565246a5 in std::panicking::try::do_call<glow::native::raw_debug_message_callback::{closure_env#0}, ()> (data=0x7fffd1ffd7c0) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panicking.rs:500
#4  0x0000555556524c8b in __rust_try ()
#5  0x000055555652461f in std::panicking::try<(), glow::native::raw_debug_message_callback::{closure_env#0}> (f=...) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panicking.rs:464
#6  0x0000555556530b4a in std::panic::catch_unwind<glow::native::raw_debug_message_callback::{closure_env#0}, ()> (f=...) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panic.rs:142
#7  0x000055555652ae30 in glow::native::raw_debug_message_callback (source=33352, gltype=33361, id=1, severity=33387, length=201, message=0x7fffd1ffd8e0, user_param=0x555556bedbe0) at src/native.rs:3211
#8  0x00007fffd230a6b3 in _mesa_gl_vdebugf (ctx=0x555556ecf970, id=0x7fffd3d3c6c0 <id>, source=MESA_DEBUG_SOURCE_SHADER_COMPILER, type=MESA_DEBUG_TYPE_OTHER, severity=MESA_DEBUG_SEVERITY_NOTIFICATION, 
    fmtString=0x7fffd3665278 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d Spilled VGPRs: %d PrivMem VGPRs: %d Outputs: %u PatchOutputs: %u DivergentLoop: %d InlineUniforms: %"..., 
    args=0x7fffd1ffe950) at ../src/mesa/main/errors.c:210
#9  0x00007fffd22a3c28 in _debug_message (data=<optimized out>, id=<optimized out>, ptype=<optimized out>, fmt=<optimized out>, args=<optimized out>) at ../src/mesa/main/debug_output.c:738
#10 0x00007fffd20c506e in _util_debug_message (cb=cb@entry=0x7fffcc04fd70, id=id@entry=0x7fffd3d3c6c0 <id>, type=type@entry=UTIL_DEBUG_TYPE_SHADER_INFO, 
    fmt=fmt@entry=0x7fffd3665278 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d Spilled VGPRs: %d PrivMem VGPRs: %d Outputs: %u PatchOutputs: %u DivergentLoop: %d InlineUniforms: %"...) at ../src/util/u_debug.c:74
#11 0x00007fffd28c8ca6 in si_shader_dump_stats_for_shader_db (screen=<optimized out>, shader=0x7fffb006ff80, debug=0x7fffcc04fd70) at ../src/gallium/drivers/radeonsi/si_shader.c:870
#12 0x00007fffd28f05d2 in si_init_shader_selector_async (job=0x7fffcc04fd40, gdata=<optimized out>, thread_index=<optimized out>) at ../src/gallium/drivers/radeonsi/si_state_shaders.cpp:2993
#13 0x00007fffd20c3d31 in util_queue_thread_func (input=input@entry=0x7fffcc0502d0) at ../src/util/u_queue.c:309
#14 0x00007fffd2112ccc in impl_thrd_routine (p=<optimized out>) at ../src/c11/impl/threads_posix.c:67
#15 0x00007ffff7aae947 in start_thread () from /lib64/libc.so.6

The interesting part is the main thread of the test in which we are unloading a dll while destroying... the vulkan instance.

#0  0x00007ffff7fcb1de in _dl_close_worker (map=<optimized out>, map@entry=0x555556b22680, force=force@entry=false) at dl-close.c:741
#1  0x00007ffff7fcb69b in _dl_close (_map=0x555556b22680) at dl-close.c:793
#2  0x00007ffff7fca523 in __GI__dl_catch_exception (exception=exception@entry=0x7fffffffa1b0, operate=0x7ffff7fcb660 <_dl_close>, args=0x555556b22680) at dl-catch.c:237
#3  0x00007ffff7fca679 in _dl_catch_error (objname=0x7fffffffa218, errstring=0x7fffffffa220, mallocedp=0x7fffffffa217, operate=<optimized out>, args=<optimized out>) at dl-catch.c:256
#4  0x00007ffff7aaa1f3 in _dlerror_run () from /lib64/libc.so.6
#5  0x00007ffff7aa9f26 in dlclose@GLIBC_2.2.5 () from /lib64/libc.so.6
#6  0x00007ffff79ee225 in loader_deactivate_layers.constprop () from /lib64/libvulkan.so.1
#7  0x00007ffff79e2e52 in vkDestroyInstance () from /lib64/libvulkan.so.1
#8  0x000055555627aaea in ash::instance::Instance::destroy_instance (self=0x55555714b920, allocation_callbacks=...) at /home/nical/.cargo/registry/src/index.crates.io-6f17d22bba15001f/ash-0.37.3+1.3.251/src/instance.rs:382
#9  0x000055555625e085 in wgpu_hal::vulkan::instance::{impl#4}::drop (self=0x55555714b920) at wgpu-hal/src/vulkan/instance.rs:560
#10 0x0000555556253fd7 in core::ptr::drop_in_place<wgpu_hal::vulkan::InstanceShared> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#11 0x000055555612e360 in alloc::sync::Arc<wgpu_hal::vulkan::InstanceShared>::drop_slow<wgpu_hal::vulkan::InstanceShared> (self=0x555556d85460) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/alloc/src/sync.rs:1263
#12 0x0000555556257d12 in alloc::sync::{impl#27}::drop<wgpu_hal::vulkan::InstanceShared> (self=0x555556d85460) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/alloc/src/sync.rs:1899
#13 0x00005555562557bb in core::ptr::drop_in_place<alloc::sync::Arc<wgpu_hal::vulkan::InstanceShared>> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#14 0x000055555609b05b in core::ptr::drop_in_place<wgpu_hal::vulkan::Instance> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#15 0x000055555609ca46 in core::ptr::drop_in_place<core::option::Option<wgpu_hal::vulkan::Instance>> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#16 0x0000555555df27a9 in core::ptr::drop_in_place<wgpu_core::instance::Instance> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#17 0x0000555555dfd857 in core::ptr::drop_in_place<wgpu_core::global::Global<wgpu_core::identity::IdentityManagerFactory>> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#18 0x0000555555df2a54 in core::ptr::drop_in_place<wgpu::backend::direct::Context> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#19 0x0000555555df2e80 in core::ptr::drop_in_place<dyn wgpu::context::DynContext> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#20 0x0000555555cf564b in alloc::sync::Arc<dyn wgpu::context::DynContext>::drop_slow<dyn wgpu::context::DynContext> (self=0x7fffffffaa10) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/alloc/src/sync.rs:1263
#21 0x0000555555dfe36d in alloc::sync::{impl#27}::drop<dyn wgpu::context::DynContext> (self=0x7fffffffaa10) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/alloc/src/sync.rs:1899
#22 0x0000555555df66ab in core::ptr::drop_in_place<alloc::sync::Arc<dyn wgpu::context::DynContext>> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#23 0x000055555574cb47 in core::ptr::drop_in_place<wgpu::Queue> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#24 0x000055555574d538 in core::ptr::drop_in_place<wgpu_test::run::TestingContext> () at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/ptr/mod.rs:497
#25 0x00005555556ff3b4 in wgpu_test::life_cycle::texture_destroy_initializer::{closure#0} (ctx=...) at tests/tests/life_cycle.rs:95
#26 0x000055555572a630 in wgpu_test::config::{impl#0}::run_sync::{closure#0}::{async_block#0}<wgpu_test::life_cycle::texture_destroy_initializer::{closure_env#0}> () at tests/src/config.rs:95
#27 0x00005555557638d9 in core::future::future::{impl#1}::poll<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>> (self=..., cx=0x7fffffffc498)
    at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/future/future.rs:125
#28 0x000055555576242c in core::panic::unwind_safe::{impl#26}::poll<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>> (self=..., cx=0x7fffffffc498)
    at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/panic/unwind_safe.rs:296
#29 0x0000555555769451 in futures_lite::future::{impl#9}::poll::{closure#0}<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>> () at /home/nical/.cargo/registry/src/index.crates.io-6f17d22bba15001f/futures-lite-2.1.0/src/future.rs:588
#30 0x0000555555762464 in core::panic::unwind_safe::{impl#23}::call_once<core::task::poll::Poll<()>, futures_lite::future::{impl#9}::poll::{closure_env#0}<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>>> (self=...) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/core/src/panic/unwind_safe.rs:271
#31 0x0000555555777a8c in std::panicking::try::do_call<core::panic::unwind_safe::AssertUnwindSafe<futures_lite::future::{impl#9}::poll::{closure_env#0}<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>>>, core::task::poll::Poll<()>> (data=0x7fffffffadc0) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panicking.rs:500
#32 0x0000555555777b5b in __rust_try ()
#33 0x00005555557779f7 in std::panicking::try<core::task::poll::Poll<()>, core::panic::unwind_safe::AssertUnwindSafe<futures_lite::future::{impl#9}::poll::{closure_env#0}<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>>>> (f=...) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panicking.rs:464
#34 0x00005555557778bb in std::panic::catch_unwind<core::panic::unwind_safe::AssertUnwindSafe<futures_lite::future::{impl#9}::poll::{closure_env#0}<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>>>, core::task::poll::Poll<()>> (f=...) at /rustc/eb26296b556cef10fb713a38f3d16b9886080f26/library/std/src/panic.rs:142
#35 0x00005555557693a9 in futures_lite::future::{impl#9}::poll<core::panic::unwind_safe::AssertUnwindSafe<core::pin::Pin<alloc::boxed::Box<(dyn core::future::future::Future<Output=()> + core::marker::Send), alloc::alloc::Global>>>> (
    self=..., cx=0x7fffffffc498) at /home/nical/.cargo/registry/src/index.crates.io-6f17d22bba15001f/futures-lite-2.1.0/src/future.rs:588
#36 0x0000555555737e7b in wgpu_test::run::execute_test::{async_fn#0} () at tests/src/run.rs:83

So I guess it is pretty risky (on Linux at least, where the driver infrastructure for GL and Vulkan shares some stuff) to have instances of multiple APIs living side by side.

Repro steps

# Probably needs a dozen or so attempts to reproduce the crash
cargo xtask test buffer_destroy
# Replace the path of the executable with the one you can find using `cargo nextest list -v`
gdb --args /home/nical/dev/rust/wgpu/target/debug/deps/wgpu_test-417544e475ba8031 --test life_cycle --test-threads=1 --nocapture

Platform

Linux (Fedora 38), Mesa 23.1.9 with an AMD iGPU

nical commented 10 months ago

Updating mesa to 23.3.2 fixed the issue.