swaywm / sway

i3-compatible Wayland compositor
https://swaywm.org
MIT License
14.69k stars 1.11k forks source link

sway+chromium crashes when an execbuffer relocation references a I915_MADV_DONTNEED object #5128

Open thejh opened 4 years ago

thejh commented 4 years ago

I'm still in the process of debugging this, but figured I'd file an issue here for now to track things, in case anyone else is concurrently trying to figure out the same issue:

When running a build of Chromium git master with native wayland support (built with use_ozone = true, invoked with --ozone-platform=wayland) on a system with integrated Intel graphics, sway occasionally dies with the error message i965: Failed to submit batchbuffer: Bad address, which comes from mesa's submit_batch(). The cause of this error message appears to be that the execbuffer's validation list contains entries pointing to objects in I915_MADV_DONTNEED state.

My suspicion so far is that the core problem is somewhere in Chromium, and mesa is making things worse by not being able to handle this condition gracefully; but I'm not sure yet.

To be able to reproduce this more reliably, you can hack a new API for testing the status of an object into the kernel:

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5f6e63952821..371ccb3aa6db 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1024,6 +1024,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
    switch (args->madv) {
    case I915_MADV_DONTNEED:
    case I915_MADV_WILLNEED:
+   case 0x42:
        break;
    default:
        return -EINVAL;
@@ -1037,7 +1038,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
    if (err)
        goto out;

-   if (i915_gem_object_has_pages(obj) &&
+   if (args->madv != 0x42 && i915_gem_object_has_pages(obj) &&
        i915_gem_object_is_tiled(obj) &&
        i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
        if (obj->mm.madv == I915_MADV_WILLNEED) {
@@ -1052,10 +1053,12 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
        }
    }

-   if (obj->mm.madv != __I915_MADV_PURGED)
+   if (obj->mm.madv != __I915_MADV_PURGED && args->madv != 0x42)
        obj->mm.madv = args->madv;
+   if (args->madv == 0x42)
+       args->madv = obj->mm.madv;

-   if (i915_gem_object_has_pages(obj)) {
+   if (args->madv != 0x42 && i915_gem_object_has_pages(obj)) {
        struct list_head *list;

        if (i915_gem_object_is_shrinkable(obj)) {

and then teach mesa to use that when adding things to an execbuffer's validation list (I really hope this is actually correct and doesn't just trigger false warnings):

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f1465ed3556..548d44458c2 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -58,6 +58,17 @@ intel_batchbuffer_reset(struct brw_context *brw);
 static void
 brw_new_batch(struct brw_context *brw);

+static void assert_needed(__DRIscreen *dri_screen, unsigned int handle)
+{
+   struct drm_i915_gem_madvise madv = {
+      .handle = handle,
+      .madv = 0x42,
+      .retained = -1,
+   };
+   assert(ioctl(dri_screen->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0);
+   assert(madv.madv == I915_MADV_WILLNEED);
+}
+
 static void
 dump_validation_list(struct intel_batchbuffer *batch)
 {
@@ -177,6 +188,9 @@ static unsigned
 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
    assert(bo->bufmgr == batch->batch.bo->bufmgr);
+   struct brw_context *brw = NULL;
+   brw = container_of(batch, brw, batch);
+   __DRIscreen *dri_screen = brw->screen->driScrnPriv;

    unsigned index = READ_ONCE(bo->index);

@@ -201,6 +215,8 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
                  batch->exec_array_size * sizeof(batch->validation_list[0]));
    }

+   assert_needed(dri_screen, bo->gem_handle);
+
    batch->validation_list[batch->exec_count] =
       (struct drm_i915_gem_exec_object2) {
          .handle = bo->gem_handle,
@@ -783,6 +799,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
          assert(entry->handle == batch->state.bo->gem_handle);
          entry->relocation_count = batch->state_relocs.reloc_count;
          entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
+
+         if (!batch->use_batch_first) {
+            for (int i=0; i<batch->state_relocs.reloc_count; i++) {
+               struct drm_i915_gem_relocation_entry *entry = batch->state_relocs.relocs + i;
+               assert_needed(dri_screen, entry->target_handle);
+            }
+         }
       }

       /* Set batchbuffer relocations */
@@ -791,6 +814,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
       entry->relocation_count = batch->batch_relocs.reloc_count;
       entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;

+      if (!batch->use_batch_first) {
+         for (int i=0; i<batch->batch_relocs.reloc_count; i++) {
+            struct drm_i915_gem_relocation_entry *entry = batch->batch_relocs.relocs + i;
+            assert_needed(dri_screen, entry->target_handle);
+         }
+      }
+
       if (batch->use_batch_first) {
          flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
       } else {
@@ -808,6 +838,10 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
          batch->exec_bos[index] = tmp_bo;
       }

+      for (int i=0; i<batch->exec_count; i++) {
+         assert_needed(dri_screen, batch->validation_list[i].handle);
+      }
+
       ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx,
                        4 * USED_BATCH(*batch),
                        in_fence_fd, out_fence_fd, flags);
@@ -926,6 +960,9 @@ emit_reloc(struct intel_batchbuffer *batch,
            struct brw_bo *target, int32_t target_offset,
            unsigned int reloc_flags)
 {
+   struct brw_context *brw = NULL;
+   brw = container_of(batch, brw, batch);
+   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
    assert(target != NULL);

    if (target->kflags & EXEC_OBJECT_PINNED) {
@@ -961,6 +998,9 @@ emit_reloc(struct intel_batchbuffer *batch,
    if (reloc_flags)
       entry->flags |= reloc_flags & batch->valid_reloc_flags;

+   if (!batch->use_batch_first)
+      assert_needed(dri_screen, target->gem_handle);
+
    rlist->relocs[rlist->reloc_count++] =
       (struct drm_i915_gem_relocation_entry) {
          .offset = offset,

The assert triggers as soon as I launch Chromium.

Based on some debugging I've done with strace, the handles referenced by these validation list entries seem to be for relocations, and seem to be allocated via DRM_IOCTL_PRIME_FD_TO_HANDLE, from here:

77661 ioctl(9, DRM_IOCTL_PRIME_FD_TO_HANDLE <unfinished ...>
[...]
77661 <... ioctl resumed>, {handle=0x19, flags=0, fd=55}) = 0
 > /lib/x86_64-linux-gnu/libc-2.30.so(ioctl+0x7) [0xf43e7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmIoctl+0x27) [0x5cc7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmPrimeFDToHandle+0x33) [0x8e63]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(brw_bo_gem_create_from_prime_internal+0x3a) [0x131d49]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(brw_bo_gem_create_from_prime_tiled+0x5a) [0x132039]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(intel_create_image_from_fds_common+0x1ad) [0x17faf1]
 > /usr/local/lib/x86_64-linux-gnu/dri/i965_dri.so(intel_create_image_from_dma_bufs2+0x6f) [0x1801a5]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image_dma_buf+0x225) [0x1fb99]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image_khr+0xf9) [0x2037f]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_drm_create_image_khr+0x66) [0x27610]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(dri2_create_image+0x5c) [0x1e4a6]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(_eglCreateImageCommon+0x13f) [0xf4ba]
 > /usr/local/lib/x86_64-linux-gnu/libEGL.so.1.0.0(eglCreateImageKHR+0x83) [0xf5a1]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_egl_create_image_from_dmabuf+0x48e) [0x29340]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_gles2_texture_from_dmabuf+0x1a2) [0x2c1d3]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(gles2_texture_from_dmabuf+0x36) [0x2ab47]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_texture_from_dmabuf+0x47) [0x2cf48]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(wlr_client_buffer_create+0x16e) [0x60fa4]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_apply_damage+0xb8) [0x7e45d]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_commit_pending+0x11d) [0x7e6ed]
 > /usr/local/lib/x86_64-linux-gnu/libwlroots.so.5(surface_commit+0x67) [0x7ea52]
 > /usr/lib/x86_64-linux-gnu/libffi.so.7.1.0(ffi_call_unix64+0x54) [0x6ccc]
 > /usr/lib/x86_64-linux-gnu/libffi.so.7.1.0(ffi_call_int+0x1a9) [0x6259]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_closure_invoke+0x15f) [0xd37f]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_client_connection_data+0x231) [0x97f1]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_event_loop_dispatch+0xc1) [0xb401]
 > /usr/lib/x86_64-linux-gnu/libwayland-server.so.0.1.0(wl_display_run+0x24) [0x99e4]
 > /usr/local/bin/sway(server_run+0x47) [0x1ef4f]
 > /usr/local/bin/sway(main+0x583) [0x1e532]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_start_main+0xea) [0x26e0a]
 > /usr/local/bin/sway(_start+0x29) [0x102b9]

and in at least one case, the handle actually initially was I915_MADV_WILLNEED, but then became I915_MADV_DONTNEED at a later point without any relevant-looking syscalls from sway.

The file descriptor from which the handle was created was received from Chromium over a unix domain socket; in Chromium, the file descriptor comes from this spot:

77702 fcntl(714, F_DUPFD_CLOEXEC, 0 <unfinished ...>
[...]
77702 <... fcntl resumed>)              = 720
 > /lib/x86_64-linux-gnu/libc-2.30.so(__fcntl64_nocancel_adjusted+0x20) [0xf34a0]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_fcntl64+0x43) [0xeea43]
 > /lib/x86_64-linux-gnu/libpthread-2.30.so(fcntl@GLIBC_2.2.5+0x3a) [0x12d9a]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_os_dupfd_cloexec+0x1d) [0xdb95d]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_closure_marshal+0x10a) [0xda0da]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_proxy_marshal_array_constructor_versioned+0x16e) [0xd792e]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(wl_proxy_marshal+0x10f) [0xd7aef]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::WaylandZwpLinuxDmabuf::CreateBuffer(base::ScopedGeneric<int, base::internal::ScopedFDCloseTraits>, gfx::Size const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned long, std::__Cr::allocator<unsigned long> > const&, unsigned int, unsigned int, base::OnceCallback<void (wl::Object<wl_buffer>)>)+0x92) [0xc6fa2]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::WaylandBufferManagerHost::CreateDmabufBasedBuffer(mojo::PlatformHandle, gfx::Size const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned int, std::__Cr::allocator<unsigned int> > const&, std::__Cr::vector<unsigned long, std::__Cr::allocator<unsigned long> > const&, unsigned int, unsigned int, unsigned int)+0x22b) [0xb0c9b]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::ozone::mojom::WaylandBufferManagerHostStubDispatch::Accept(ui::ozone::mojom::WaylandBufferManagerHost*, mojo::Message*)+0x8d2) [0xce462]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::InterfaceEndpointClient::HandleValidatedMessage(mojo::Message*)+0x295) [0x25095]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::internal::MultiplexRouter::ProcessIncomingMessage(mojo::internal::MultiplexRouter::MessageWrapper*, mojo::internal::MultiplexRouter::ClientCallBehavior, base::SequencedTaskRunner*)+0x2f7) [0x2b8c7]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::internal::MultiplexRouter::Accept(mojo::Message*)+0x116) [0x2b0b6]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::Connector::DispatchMessage(mojo::Message)+0xf4) [0x202a4]
 > /home/jann/chromium/src/out/mybuild_stable/libbindings.so(mojo::Connector::DispatchNextMessageInQueue()+0x18e) [0x1ff6e]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::TaskAnnotator::RunTask(char const*, base::PendingTask*)+0x12a) [0x18656a]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoWorkImpl(base::sequence_manager::LazyNow*, bool*)+0x17d) [0x1990fd]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoSomeWork()+0x9f) [0x198e8f]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::MessagePumpGlib::Run(base::MessagePump::Delegate*)+0x89) [0x13f0d9]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::Run(bool, base::TimeDelta)+0x74) [0x199904]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::RunLoop::Run()+0x18d) [0x169e1d]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(ChromeBrowserMainParts::MainMessageLoopRun(int*)+0x72) [0x152a0e2]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMainLoop::RunMainMessageLoopParts()+0x3a) [0xbb71ca]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMainRunnerImpl::Run()+0x11) [0xbb8f51]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::BrowserMain(content::MainFunctionParams const&)+0xcc) [0xbb477c]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMainRunnerImpl::RunServiceManager(content::MainFunctionParams&, bool)+0x3c8) [0x13c9668]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMainRunnerImpl::Run(bool)+0x126) [0x13c9276]
 > /home/jann/chromium/src/out/mybuild_stable/libembedder.so(service_manager::Main(service_manager::MainParams const&)+0xd33) [0xf753]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMain(content::ContentMainParams const&)+0x80) [0x13c74d0]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(ChromeMain+0xcd) [0xf11cfd]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_start_main+0xea) [0x26e0a]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(_start+0x29) [0xf11b39]
thejh commented 4 years ago

Oh, and in case someone encountering this just wants a quick workaround: Try putting the following into ~/.drirc:

<driconf>
  <device screen="0" driver="i965">
    <application name="chrome" executable="chrome">
      <option name="bo_reuse" value="0" />
    </application>
  </device>
</driconf>

At least in a quick test, this seems to prevent the problem from triggering (probably at the cost of making Chromium a little bit slower).

thejh commented 4 years ago

Hm... normally, when buffer objects are exported via brw_bo_gem_export_to_prime(), bo->reusable is set to false. But Chromium's CreateBufferForBO() calls GetPlaneFdForBo(), which directly calls drmPrimeHandleToFD() (just an example call chain, not necessarily the culprit):

77929 <... ioctl resumed>)              = 0
 > /lib/x86_64-linux-gnu/libc-2.30.so(ioctl+0x7) [0xf43e7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmIoctl+0x27) [0x5cc7]
 > /usr/lib/x86_64-linux-gnu/libdrm.so.2.4.0(drmPrimeHandleToFD+0x36) [0x8e06]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(gbm_wrapper::CreateBufferForBO(gbm_bo*, unsigned int, gfx::Size const&, unsigned int)+0x24e) [0x9effe]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(gbm_wrapper::Device::CreateBufferWithModifiers(unsigned int, gfx::Size const&, unsigned int, std::__Cr::vector<unsigned long, std::__Cr::allocator<unsigned long> > const&)+0xd2) [0x9fd82]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::GbmPixmapWayland::InitializeBuffer(gfx::Size, gfx::BufferFormat, gfx::BufferUsage)+0xfc) [0xca4dc]
 > /home/jann/chromium/src/out/mybuild_stable/libozone.so(ui::WaylandSurfaceFactory::CreateNativePixmap(int, VkDevice_T*, gfx::Size, gfx::BufferFormat, gfx::BufferUsage, base::Optional<gfx::Size>)+0x5c) [0xac51c]
 > /home/jann/chromium/src/out/mybuild_stable/libgpu_ipc_service.so(gpu::GpuMemoryBufferFactoryNativePixmap::CreateAnonymousImage(gfx::Size const&, gfx::BufferFormat, gfx::BufferUsage, int, bool*)+0xad) [0x4890d]
 > /home/jann/chromium/src/out/mybuild_stable/libgpu_ipc_service.so(non-virtual thunk to gpu::GpuMemoryBufferFactoryNativePixmap::CreateAnonymousImage(gfx::Size const&, gfx::BufferFormat, gfx::BufferUsage, int, bool*)+0x19) [0x48c69]
 > /home/jann/chromium/src/out/mybuild_stable/libgles2.so(gpu::SharedImageBackingFactoryGLTexture::CreateSharedImageInternal(gpu::Mailbox const&, viz::ResourceFormat, int, gfx::Size const&, gfx::ColorSpace const&, unsigned int, base::span<unsigned char const, 18446744073709551615ul>)+0x580) [0x23a390]
 > /home/jann/chromium/src/out/mybuild_stable/libgles2.so(gpu::SharedImageBackingFactoryGLTexture::CreateSharedImage(gpu::Mailbox const&, viz::ResourceFormat, int, gfx::Size const&, gfx::ColorSpace const&, unsigned int, bool)+0x41) [0x239e01]
 > /home/jann/chromium/src/out/mybuild_stable/libgles2.so(gpu::SharedImageFactory::CreateSharedImage(gpu::Mailbox const&, viz::ResourceFormat, gfx::Size const&, gfx::ColorSpace const&, int, unsigned int)+0x91) [0x23d031]
 > /home/jann/chromium/src/out/mybuild_stable/libservice.so(viz::SkiaOutputDeviceBufferQueue::Image::Initialize(gfx::Size const&, gfx::ColorSpace const&, viz::ResourceFormat, viz::SkiaOutputSurfaceDependency*, unsigned int)+0x4d) [0x144b5d]
 > /home/jann/chromium/src/out/mybuild_stable/libservice.so(viz::SkiaOutputDeviceBufferQueue::Reshape(gfx::Size const&, float, gfx::ColorSpace const&, gfx::BufferFormat, gfx::OverlayTransform)+0x1d6) [0x144a16]
 > /home/jann/chromium/src/out/mybuild_stable/libservice.so(viz::SkiaOutputSurfaceImplOnGpu::Reshape(gfx::Size const&, float, gfx::ColorSpace const&, gfx::BufferFormat, bool, gfx::OverlayTransform)+0xcd) [0x150c8d]
 > /home/jann/chromium/src/out/mybuild_stable/libservice.so(base::internal::Invoker<base::internal::BindState<viz::SkiaOutputSurfaceImpl::ScheduleGpuTask(base::OnceCallback<void ()>, std::__Cr::vector<gpu::SyncToken, std::__Cr::allocator<gpu::SyncToken> >)::$_2, base::OnceCallback<void ()> >, void ()>::RunOnce(base::internal::BindStateBase*)+0x49) [0x14faa9]
 > /home/jann/chromium/src/out/mybuild_stable/libgpu.so(gpu::Scheduler::RunNextTask()+0x404) [0x87854]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::TaskAnnotator::RunTask(char const*, base::PendingTask*)+0x12a) [0x18656a]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoWorkImpl(base::sequence_manager::LazyNow*, bool*)+0x17d) [0x1990fd]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::DoSomeWork()+0x9f) [0x198e8f]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::MessagePumpDefault::Run(base::MessagePump::Delegate*)+0x59) [0x13e9b9]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::sequence_manager::internal::ThreadControllerWithMessagePumpImpl::Run(bool, base::TimeDelta)+0x74) [0x199904]
 > /home/jann/chromium/src/out/mybuild_stable/libbase.so(base::RunLoop::Run()+0x18d) [0x169e1d]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::GpuMain(content::MainFunctionParams const&)+0x6f7) [0x84ea87]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::RunZygote(content::ContentMainDelegate*)+0xae7) [0x13c7ff7]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMainRunnerImpl::Run(bool)+0x114) [0x13c9264]
 > /home/jann/chromium/src/out/mybuild_stable/libembedder.so(service_manager::Main(service_manager::MainParams const&)+0xd33) [0xf753]
 > /home/jann/chromium/src/out/mybuild_stable/libcontent.so(content::ContentMain(content::ContentMainParams const&)+0x80) [0x13c74d0]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(ChromeMain+0xcd) [0xf11cfd]
 > /lib/x86_64-linux-gnu/libc-2.30.so(__libc_start_main+0xea) [0x26e0a]
 > /home/jann/chromium/src/out/mybuild_stable/chrome(_start+0x29) [0xf11b39]

I guess it's time for me to file a Chromium bug...

thejh commented 4 years ago

FWIW, I've filed https://bugs.chromium.org/p/chromium/issues/detail?id=1063680. From what I can tell, a proper fix is going to be annoying to implement because it requires extending the GBM interface and adding plumbing in mesa.

emersion commented 4 years ago

Shouldn't Chromium keep the buffer alive till wl_buffer.release is received?

thejh commented 4 years ago

Oh, I didn't realize that that callback existed... I guess I'll take another look.

travankor commented 3 years ago

I had similar crashing on Wayfire, so this is a wlroots issue, right? (Doesn't seem to affect the iris driver, though)

mindrunner commented 2 years ago

My chromium randomly crashes and drags down everything returning me to tty1. Happens when I am e.g. in tabbed layout and then switching TO chromium window. How can I verify I am facing exactly this issue you are describing here?

thejh commented 2 years ago

@mindrunner you could try the bo_reuse workaround I posted further up in the thread

mindrunner commented 2 years ago

yeah, just did that. Sometimes it does not happen for days, tho.

mindrunner commented 2 years ago

I understand, this is a chrome bug (possibly). However, I wonder if there is no way so that a client crashes if it fuck up things but not pull down the whole rest of that system. Isn't that kind of the idea to have a user-space and separate things. Especially in wayland world where it is pain in the ass to share a screen for security reasons, but then some buggy code is able to make everything crash. Is there no way for sway to check if that piece of memory is valid and allowed to access?

thejh commented 2 years ago

@mindrunner AFAIU sway can't really do that on its own (in a non-racy way) because the decision to exit the whole process when the kernel signals an error happens in mesa. So to fix that part, you'd probably have to go change mesa to add some kinda retry logic for the error case to figure out what went wrong and remove problematic parts.

mindrunner commented 2 years ago

Makes sense. Would be a killer feature for me tho. Having a stable environment in daily business has so much value. Guess, it's not easy to implement :(

mindrunner commented 2 years ago

The bo_reuse workaround seems to mitigate the issue for me. Didn't have a crash with chromium since then. However, the instability of sway due to badly programmed 3rd party software is really giving me doubts if this is suitable as a daily driver setup. I just had a full sway/wayland crash after coming out of a screenshare/huddle session in slack. Slack UI was unresponsive and I was not able to end the call. After killing the client, the whole Desktop went down and I found myself back on tty1 :(

I am aware, this is hard to debug and super pain in the ass. Bug can be anywhere. kernel/drivers/slack/chromium/electron/wlroots/sway. Uff, what a long list...

Still I wonder if

1) there is anything I can do? Am I supposed to have debug symbols attached all the time to be able to send stack-traces? Does that have a significant performance impact?

2) is there effort put into this at all? Or is the only solution to figure out, it's not a sway bug, so we have to report somewhere else?

Sorry for being bit off topic and polluting here.