gfx-rs / wgpu

A cross-platform, safe, pure-Rust graphics API.
https://wgpu.rs
Apache License 2.0
11.51k stars 858 forks source link

[core] deadlock between `poll_all_devices` and `queue_submit` #5695

Open sagudev opened 1 month ago

sagudev commented 1 month ago

Description A deadlock occurs between `poll_all_devices` (inside `deferred_resource_destruction`, triggered by a bind group destroy) and `queue_submit` (inside `device.maintain`):

thread backtrace
  thread #111, name = 'WGPU'
    frame #0: 0x00007ffff6f2725d libc.so.6`syscall at syscall.S:38
    frame #1: 0x000055555bd38887 servo`parking_lot::raw_mutex::RawMutex::lock_slow at linux.rs:112:13
    frame #2: 0x000055555bd3886a servo`parking_lot::raw_mutex::RawMutex::lock_slow [inlined] <parking_lot_core::thread_parker::imp::ThreadParker as parking_lot_core::thread_parker::ThreadParkerT>::park at linux.rs:66:13
    frame #3: 0x000055555bd38864 servo`parking_lot::raw_mutex::RawMutex::lock_slow at parking_lot.rs:635:36
    frame #4: 0x000055555bd38809 servo`parking_lot::raw_mutex::RawMutex::lock_slow at parking_lot.rs:207:5
    frame #5: 0x000055555bd38809 servo`parking_lot::raw_mutex::RawMutex::lock_slow at parking_lot.rs:600:5
    frame #6: 0x000055555bd38809 servo`parking_lot::raw_mutex::RawMutex::lock_slow(self=0x00007ffe9db04cf0, timeout=Instant>{...}) at raw_mutex.rs:262:17
    frame #7: 0x000055555b0142d4 servo`core::ptr::drop_in_place<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>> [inlined] <parking_lot::raw_mutex::RawMutex as lock_api::mutex::RawMutex>::lock(self=0x00007ffe9db04cf0) at raw_mutex.rs:72:13
    frame #8: 0x000055555b0142b5 servo`core::ptr::drop_in_place<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>> at mutex.rs:223:9
    frame #9: 0x000055555b0142b5 servo`core::ptr::drop_in_place<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>> [inlined] wgpu_core::lock::vanilla::Mutex<T>::lock at vanilla.rs:29:27
    frame #10: 0x000055555b0142ac servo`core::ptr::drop_in_place<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>> [inlined] <wgpu_core::resource::DestroyedBuffer<A> as core::ops::drop::Drop>::drop(self=0x00007ffea941b150) at resource.rs:646:57
    frame #11: 0x000055555b0142a4 servo`core::ptr::drop_in_place<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>((null)=0x00007ffea941b150) at mod.rs:497:1
    frame #12: 0x000055555b01855d servo`alloc::sync::Arc<T,A>::drop_slow(self=<unavailable>) at sync.rs:1751:18
    frame #13: 0x000055555b102c80 servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] <alloc::sync::Arc<T,A> as core::ops::drop::Drop>::drop(self=<unavailable>) at sync.rs:2407:13
    frame #14: 0x000055555b102c7b servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::drop_in_place<alloc::sync::Arc<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>>((null)=<unavailable>) at mod.rs:497:1
    frame #15: 0x000055555b102c7b servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::drop_in_place<(wgpu_core::track::TrackerIndex,alloc::sync::Arc<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>)>((null)=<unavailable>) at mod.rs:497:1
    frame #16: 0x000055555b102c6d servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::mut_ptr::<impl *mut T>::drop_in_place at mut_ptr.rs:1461:18
    frame #17: 0x000055555b102c6d servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] hashbrown::raw::Bucket<T>::drop(self=<unavailable>) at mod.rs:581:23
    frame #18: 0x000055555b102c6d servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> at mod.rs:1038:17
    frame #19: 0x000055555b102bee servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] <hashbrown::raw::RawTable<T,A> as core::ops::drop::Drop>::drop(self=0x00007fff447f7480) at mod.rs:2699:17
    frame #20: 0x000055555b102bde servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::drop_in_place<hashbrown::raw::RawTable<(wgpu_core::track::TrackerIndex,alloc::sync::Arc<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>)>>((null)=0x00007fff447f7480) at mod.rs:497:1
    frame #21: 0x000055555b102bde servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::drop_in_place<hashbrown::map::HashMap<wgpu_core::track::TrackerIndex,alloc::sync::Arc<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>,core::hash::BuildHasherDefault<rustc_hash::FxHasher>>>((null)=0x00007fff447f7480) at mod.rs:497:1
    frame #22: 0x000055555b102bde servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>> [inlined] core::ptr::drop_in_place<std::collections::hash::map::HashMap<wgpu_core::track::TrackerIndex,alloc::sync::Arc<wgpu_core::resource::DestroyedBuffer<wgpu_hal::vulkan::Api>>,core::hash::BuildHasherDefault<rustc_hash::FxHasher>>>((null)=size=1, capacity=4) at mod.rs:497:1
    frame #23: 0x000055555b102bde servo`core::ptr::drop_in_place<wgpu_core::device::life::ResourceMaps<wgpu_hal::vulkan::Api>>((null)=0x00007fff447f7300) at mod.rs:497:1
    frame #24: 0x000055555b111172 servo`wgpu_core::device::life::LifetimeTracker<A>::triage_submissions(self=0x00007ffe9db04888, last_done=<unavailable>, command_allocator=0x00007ffe9db04538) at life.rs:413:9
    frame #25: 0x000055555b0d5d08 servo`wgpu_core::device::resource::Device<A>::maintain(self=0x00007ffe9db02010, fence_guard=wgpu_core::lock::vanilla::RwLockReadGuard<core::option::Option<wgpu_hal::vulkan::Fence>> @ r13, maintain=<unavailable>, snatch_guard=<unavailable>) at resource.rs:436:13
    frame #26: 0x000055555b075ca9 servo`wgpu_core::device::queue::<impl wgpu_core::global::Global>::queue_submit(self=<unavailable>, queue_id=<unavailable>, command_buffer_ids=<unavailable>) at queue.rs:1494:23
thread backtrace
  thread #112, name = 'WGPU poller'
    frame #0: 0x00007ffff6f2725d libc.so.6`syscall at syscall.S:38
    frame #1: 0x000055555bd36cd2 servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers at linux.rs:112:13
    frame #2: 0x000055555bd36cb5 servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers [inlined] <parking_lot_core::thread_parker::imp::ThreadParker as parking_lot_core::thread_parker::ThreadParkerT>::park at linux.rs:66:13
    frame #3: 0x000055555bd36c96 servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers at parking_lot.rs:635:36
    frame #4: 0x000055555bd36b1b servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers at parking_lot.rs:207:5
    frame #5: 0x000055555bd36a85 servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers at parking_lot.rs:600:5
    frame #6: 0x000055555bd36a85 servo`parking_lot::raw_rwlock::RawRwLock::wait_for_readers(self=0x00007ffe9db045a8, timeout=Instant>{...}, prev_value=0) at raw_rwlock.rs:1017:17
    frame #7: 0x000055555bd3409e servo`parking_lot::raw_rwlock::RawRwLock::lock_exclusive_slow(self=0x00007ffe9db045a8, timeout=Instant>{...}) at raw_rwlock.rs:647:9
    frame #8: 0x000055555b176db3 servo`wgpu_core::device::resource::Device<A>::deferred_resource_destruction [inlined] <parking_lot::raw_rwlock::RawRwLock as lock_api::rwlock::RawRwLock>::lock_exclusive(self=0x00007ffe9db045a8) at raw_rwlock.rs:73:26
    frame #9: 0x000055555b176d97 servo`wgpu_core::device::resource::Device<A>::deferred_resource_destruction at rwlock.rs:500:9
    frame #10: 0x000055555b176d97 servo`wgpu_core::device::resource::Device<A>::deferred_resource_destruction [inlined] wgpu_core::lock::vanilla::RwLock<T>::write at vanilla.rs:85:33
    frame #11: 0x000055555b176d97 servo`wgpu_core::device::resource::Device<A>::deferred_resource_destruction [inlined] wgpu_core::snatch::SnatchLock::write(self=0x00007ffe9db045a8) at snatch.rs:154:40
    frame #12: 0x000055555b176d7f servo`wgpu_core::device::resource::Device<A>::deferred_resource_destruction(self=0x00007ffe9db02010) at resource.rs:359:70
    frame #13: 0x000055555b1640d0 servo`wgpu_core::device::global::<impl wgpu_core::global::Global>::poll_all_devices at global.rs:2152:9
    frame #14: 0x000055555b163fe9 servo`wgpu_core::device::global::<impl wgpu_core::global::Global>::poll_all_devices at global.rs:2188:21
    frame #15: 0x000055555b163e9f servo`wgpu_core::device::global::<impl wgpu_core::global::Global>::poll_all_devices(self=<unavailable>, force_wait=<unavailable>) at global.rs:2213:17
    frame #16: 0x000055555b051c66 servo`std::sys_common::backtrace::__rust_begin_short_backtrace [inlined]

Repro steps Similar to https://github.com/gfx-rs/wgpu/issues/5687, but this one is much harder to reproduce (it happens more rarely). Build servo at https://github.com/servo/servo/pull/32266/commits/258406ddbdef5cf526a5a7f83e0fb7871470ec73 and run https://gpuweb.github.io/cts/standalone/?runnow=1&q=webgpu:api,operation,compute,basic:*.

Expected vs observed behavior Expected: no deadlock. Observed: the two threads shown in the backtraces above block each other indefinitely.

Platform wgpu-core on trunk (d0a5e48aa7e84683114c3870051cc414ae92ac03)

sagudev commented 1 month ago

One thread tries to acquire the lock at https://github.com/gfx-rs/wgpu/blob/fa48562229ae9effe10696e74249304f1fb9a3f0/wgpu-core/src/resource.rs#L646 while the other thread has already acquired it at https://github.com/gfx-rs/wgpu/blob/fa48562229ae9effe10696e74249304f1fb9a3f0/wgpu-core/src/device/resource.rs#L335 . That second thread in turn tries to acquire the lock at https://github.com/gfx-rs/wgpu/blob/fa48562229ae9effe10696e74249304f1fb9a3f0/wgpu-core/src/device/resource.rs#L359 , which the first thread already holds: it was acquired at https://github.com/gfx-rs/wgpu/blob/fa48562229ae9effe10696e74249304f1fb9a3f0/wgpu-core/src/device/queue.rs#L1162 and later passed on to https://github.com/gfx-rs/wgpu/blob/fa48562229ae9effe10696e74249304f1fb9a3f0/wgpu-core/src/device/queue.rs#L1494