Crash: Access violation in call to vkSetDebugUtilsObjectName()

simonask commented 5 months ago

Description

Occasionally my tests crash with the following exception:

Exception 0xc0000005 encountered at address 0x7fff597eb714: Access violation reading location 0xffffffffffffffff

This happens in a test that exercises buffer memory mapping, but the stack trace does not point to memory mapping facilities in any obvious way. Rather, it happens at different points, but always when set_object_name() is called from create_buffer(), sometimes even while creating the adapter/device pair when wgpu creates its internal staging buffers.

I can't reproduce in release builds (but that could just be a timing thing), and I also haven't seen it when running with cargo test --jobs 1. The tests run in parallel (normal cargo test) using headless adapters/devices, so other adapters and devices are potentially alive while this happens.

The Vulkan spec for vkResetEvent() states that "Host access to event must be externally synchronized", but it's not clear to me whether the event here is shared in any way.

Example stack trace:

vkResetEvent (@vkResetEvent:62507)
vkResetEvent (@vkResetEvent:62507)
vkEnumerateInstanceLayerProperties (@vkEnumerateInstanceLayerProperties:66186)
vkResetEvent (@vkResetEvent:62507)
union enum2$<core::result::Result<tuple$<>,ash::vk::enums::Result> > ash::extensions::ext::debug_utils::DebugUtils::set_debug_utils_object_name(struct ash::vk::definitions::Device, struct ash::vk::definitions::DebugUtilsObjectNameInfoEXT *) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\ash-0.37.3+1.3.251\src\extensions\ext\debug_utils.rs:40)
static void wgpu_hal::vulkan::DeviceShared::set_object_name<ash::vk::definitions::Buffer>(struct ash::vk::enums::ObjectType, struct ash::vk::definitions::Buffer, struct ref$<str$>) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-hal-0.19.1\src\vulkan\device.rs:55)
union enum2$<core::result::Result<wgpu_hal::vulkan::Buffer,wgpu_hal::DeviceError> > wgpu_hal::vulkan::device::impl$4::create_buffer(struct wgpu_hal::vulkan::Device *, struct wgpu_hal::BufferDescriptor *) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-hal-0.19.1\src\vulkan\device.rs:898)
union enum2$<core::result::Result<wgpu_core::resource::Buffer<wgpu_hal::vulkan::Api>,enum2$<wgpu_core::resource::CreateBufferError> > > wgpu_core::device::resource::Device<wgpu_hal::vulkan::Api>::create_buffer<wgpu_hal::vulkan::Api>(struct wgpu_types::BufferDescriptor<enum2$<core::option::Option<enum2$<alloc::borrow::Cow<str$> > > > > *, bool) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-core-0.19.0\src\device\resource.rs:558)
wgpu_core::device::global::<impl wgpu_core::global::Global<G>>::device_create_buffer (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-core-0.19.0\src\device\global.rs:222)
struct tuple$<wgpu_core::id::Id<wgpu_core::resource::Buffer<wgpu_hal::empty::Api> >,wgpu::backend::wgpu_core::Buffer> wgpu::backend::wgpu_core::impl$7::device_create_buffer(struct wgpu::backend::wgpu_core::ContextWgpuCore *, struct wgpu_core::id::Id<wgpu_core::device::resource::Device<wgpu_hal::empty::Api> > *, struct wgpu::backend::wgpu_core::Device *, struct wgpu_types::BufferDescriptor<enum2$<core::option::Option<ref$<str$> > > > *) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-0.19.1\src\backend\wgpu_core.rs:1220)
struct tuple$<wgpu::context::ObjectId,alloc::boxed::Box<dyn$<core::any::Any,core::marker::Send,core::marker::Sync>,alloc::alloc::Global> > wgpu::context::impl$5::device_create_buffer<wgpu::backend::wgpu_core::ContextWgpuCore>(struct wgpu::backend::wgpu_core::ContextWgpuCore *, struct wgpu::context::ObjectId *, struct ref$<dyn$<core::any::Any,core::marker::Send,core::marker::Sync> >, struct wgpu_types::BufferDescriptor<enum2$<core::option::Option<ref$<str$> > > > *) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-0.19.1\src\context.rs:2287)
struct wgpu::Buffer wgpu::Device::create_buffer(struct wgpu_types::BufferDescriptor<enum2$<core::option::Option<ref$<str$> > > > *) (c:\Users\simon\.cargo\registry\src\index.crates.io-6f17d22bba15001f\wgpu-0.19.1\src\lib.rs:2514)
struct dreamcoat_render::buffer::buffer::Buffer<slice2$<i32> > dreamcoat_render::buffer::buffer::BufferBuilder<slice2$<i32> >::create<slice2$<i32> >(struct wgpu::Device *) (c:\code\the-community\dreamcoat\dreamcoat-render\src\buffer\buffer.rs:133)
void dreamcoat_render::download_belt::tests::download_buffer_subslice() (c:\code\the-community\dreamcoat\dreamcoat-render\src\download_belt.rs:388)
dreamcoat_render::download_belt::tests::download_buffer_subslice::{{closure}} (c:\code\the-community\dreamcoat\dreamcoat-render\src\download_belt.rs:385)
core::ops::function::FnOnce::call_once (@core::ops::function::FnOnce::call_once:11)
static void test::__rust_begin_short_backtrace<enum2$<core::result::Result<tuple$<>,alloc::string::String> >,enum2$<core::result::Result<tuple$<>,alloc::string::String> > (*)()>() (@7ff747b5b3cf..7ff747b5b427:3)
static void test::run_test::closure$0() (@7ff747b59e20..7ff747b59ed9:3)
static void std::sys_common::backtrace::__rust_begin_short_backtrace<test::run_test::closure_env$1,tuple$<> >() (@7ff747b1761a..7ff747b176a4:3)
static void core::ops::function::FnOnce::call_once<std::thread::impl$0::spawn_unchecked_::closure_env$1<test::run_test::closure_env$1,tuple$<> >,tuple$<> >() (@7ff747b1d460..7ff747b1d4d7:3)
static void std::sys::windows::thread::impl$0::new::thread_start() (@7ff748495c1c..7ff748495c9b:3)

Repro steps

This is triggered deep into some custom engine code, but I'm not doing any unsafe shenanigans here, and so at minimum I would expect a validation error if I'm doing something wrong. If the problem isn't obvious to someone familiar with the code, I'll happily create a minimal repro case, just let me know.

Expected vs observed behavior

Expected no crash. :-)

Platform

Windows 11, wgpu 0.19.1, latest NVIDIA drivers

cwfitzgerald commented 5 months ago

This is really weird, and it is seemingly caused by us setting a debug name? The driver then appears to call vkResetEvent, assuming that information is correct (note the random switch to vkEnumerateInstanceLayerProperties, which suggests that the trace isn't correct).

cwfitzgerald commented 5 months ago

To work around this, you can add InstanceFlags::DISCARD_HAL_LABELS (https://docs.rs/wgpu/latest/wgpu/struct.InstanceFlags.html#associatedconstant.DISCARD_HAL_LABELS) to your instance flags to prevent us from passing in the labels.

simonask commented 4 months ago

Oh interesting, I can reproduce this on macOS with MoltenVK. It always happens after the 2nd frame has been presented in my case.

It isn't obvious from the stack trace that this is the same problem, but the reason I still believe that to be the case is that adding wgpu::InstanceFlags::DISCARD_HAL_LABELS actually seems to fix the crash.

Backtrace:

vulkan_layer_chassis::BeginCommandBuffer(VkCommandBuffer_T*, VkCommandBufferBeginInfo const*) (@vulkan_layer_chassis::BeginCommandBuffer(VkCommandBuffer_T*, VkCommandBufferBeginInfo const*):42)
ash::device::Device::begin_command_buffer (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/ash-0.37.3+1.3.251/src/device.rs:2382)
wgpu_hal::vulkan::command::<impl wgpu_hal::CommandEncoder<wgpu_hal::vulkan::Api> for wgpu_hal::vulkan::CommandEncoder>::begin_encoding (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-hal-0.19.1/src/vulkan/command.rs:91)
wgpu_core::command::CommandEncoder<A>::open_pass (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-0.19.0/src/command/mod.rs:100)
wgpu_core::command::render::<impl wgpu_core::global::Global<G>>::command_encoder_run_render_pass_impl (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-0.19.0/src/command/render.rs:1359)
wgpu_core::command::render::<impl wgpu_core::global::Global<G>>::command_encoder_run_render_pass (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-0.19.0/src/command/render.rs:1288)
<wgpu::backend::wgpu_core::ContextWgpuCore as wgpu::context::Context>::command_encoder_end_render_pass (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-0.19.1/src/backend/wgpu_core.rs:1898)
<T as wgpu::context::DynContext>::command_encoder_end_render_pass (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-0.19.1/src/context.rs:2760)
<wgpu::RenderPass as core::ops::drop::Drop>::drop (/Users/simon/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-0.19.1/src/lib.rs:4109)
core::ptr::drop_in_place<wgpu::RenderPass> (@core::ptr::drop_in_place<wgpu::RenderPass>:10)

gfx-rs / wgpu

Crash: Access violation in call to vkSetDebugUtilsObjectName() #5270