tracel-ai / burn

Burn is a new comprehensive dynamic Deep Learning Framework built using Rust with extreme flexibility, compute efficiency and portability as its primary goals.
https://burn.dev
Apache License 2.0
8.41k stars 413 forks source link

assertion `left == right` failed left: 0 right: 1 only on Epoch 27 #2190

Open jguhlin opened 3 weeks ago

jguhlin commented 3 weeks ago

Describe the bug: A strange assertion failure that occurs only on epoch 27 (tested 5 times now; the crash is always at the same spot, even when the batch size is changed). Using the main branch.

Error is this line:

let branches = Tensor::cat(branches, 0).to_device(&self.device);

here: https://github.com/jguhlin/taxotango/blob/9fb413e7531a8bd9f5724d215c47eade9163489a/src/model.rs#L217

To Reproduce: The code is here: https://github.com/jguhlin/taxotango/tree/Inference-and-Queries — run it with `cargo run --release`. But you also need these files and the path: `wget https://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz`

Expected behavior

Screenshots

[Train - Epoch 27 - Iteration 966] Loss 0.614
thread '<unnamed>' panicked at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/id.rs:50:9:
assertion `left == right` failed
  left: 0
 right: 1
stack backtrace:
   0:     0x6226c800fb65 - std::backtrace_rs::backtrace::libunwind::trace::h1a07e5dba0da0cd2
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5
   1:     0x6226c800fb65 - std::backtrace_rs::backtrace::trace_unsynchronized::h61b9b8394328c0bc
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:     0x6226c800fb65 - std::sys_common::backtrace::_print_fmt::h1c5e18b460934cff
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:68:5
   3:     0x6226c800fb65 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h1e1a1972118942ad
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:44:22
   4:     0x6226c803de2b - core::fmt::rt::Argument::fmt::h07af2b4071d536cd
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/rt.rs:165:63
   5:     0x6226c803de2b - core::fmt::write::hc090a2ffd6b28c4a
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/fmt/mod.rs:1157:21
   6:     0x6226c800c00f - std::io::Write::write_fmt::h8898bac6ff039a23
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/io/mod.rs:1832:15
   7:     0x6226c800f93e - std::sys_common::backtrace::_print::h4e80c5803d4ee35b
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:47:5
   8:     0x6226c800f93e - std::sys_common::backtrace::print::ha96650907276675e
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:34:9
   9:     0x6226c8010d99 - std::panicking::default_hook::{{closure}}::h215c2a0a8346e0e0
  10:     0x6226c8010add - std::panicking::default_hook::h207342be97478370
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:298:9
  11:     0x6226c80112a3 - std::panicking::rust_panic_with_hook::hac8bdceee1e4fe2c
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:795:13
  12:     0x6226c8011184 - std::panicking::begin_panic_handler::{{closure}}::h00d785e82757ce3c
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:664:13
  13:     0x6226c8010029 - std::sys_common::backtrace::__rust_end_short_backtrace::h1628d957bcd06996
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:171:18
  14:     0x6226c8010eb7 - rust_begin_unwind
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:652:5
  15:     0x6226c803acf3 - core::panicking::panic_fmt::hdc63834ffaaefae5
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panicking.rs:72:14
  16:     0x6226c803b0ae - core::panicking::assert_failed_inner::hda4754f94c1c1cb1
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panicking.rs:409:17
  17:     0x6226c7cf845f - core::panicking::assert_failed::h649a82cc51cfb044
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panicking.rs:364:5
  18:     0x6226c7c9ff37 - wgpu_core::id::RawId::zip::hec6d2da2eb326b53
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/id.rs:50:9
  19:     0x6226c7be9b10 - wgpu_core::id::Id<T>::zip::hbcca811fb4ed870d
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/id.rs:171:12
  20:     0x6226c7be9b10 - wgpu_core::identity::IdentityValues::alloc::h1b97904fda39e24e
  21:     0x6226c7be9b10 - wgpu_core::identity::IdentityManager<T>::process::h017551fa92e06b31
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/identity.rs:107:9
  22:     0x6226c7ba56bc - wgpu_core::registry::Registry<T>::prepare::h788c73050a88a59a
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/registry.rs:99:25
  23:     0x6226c7ba56bc - wgpu_core::device::queue::<impl wgpu_core::global::Global>::queue_create_staging_buffer::he2972d22a76e66d3
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-core-22.1.0/src/device/queue.rs:449:19
  24:     0x6226c7c8879c - <wgpu::backend::wgpu_core::ContextWgpuCore as wgpu::context::Context>::queue_create_staging_buffer::h5871eafe5e3fbf41
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-22.1.0/src/backend/wgpu_core.rs:2214:15
  25:     0x6226c7c8bb59 - <T as wgpu::context::DynContext>::queue_create_staging_buffer::h8a64aa15e08d06ee
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-22.1.0/src/context.rs:2994:9
  26:     0x6226c7af18a8 - wgpu::Queue::write_buffer_with::hc01876246eee9544
                               at /home/joseph/.cargo/registry/src/index.crates.io-6f17d22bba15001f/wgpu-22.1.0/src/lib.rs:5454:30
  27:     0x6226c77a294c - <cubecl_wgpu::compute::server::WgpuServer<MM> as cubecl_runtime::server::ComputeServer>::create::hf9ef8fe387450daf
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-wgpu/src/compute/server.rs:212:13
  28:     0x6226c76a25c9 - <cubecl_runtime::channel::mutex::MutexComputeChannel<Server> as cubecl_runtime::channel::base::ComputeChannel<Server>>::create::hc5cb12efe10a7882
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-runtime/src/channel/mutex.rs:53:9
  29:     0x6226c76a25c9 - cubecl_runtime::client::ComputeClient<Server,Channel>::create::h3c68eb7059e2cd1c
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-runtime/src/client.rs:66:9
  30:     0x6226c76a25c9 - cubecl_core::codegen::execution::execute_settings::h4acd522494b4b311
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-core/src/codegen/execution.rs:290:23
  31:     0x6226c76a1db2 - cubecl_core::codegen::execution::execute_dynamic::hd392bcd47d3e989f
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-core/src/codegen/execution.rs:206:20
  32:     0x6226c770c0b5 - cubecl_core::codegen::execution::Execution<K,R,(&[E],)>::execute::he44daeb471916329
                               at /home/joseph/.cargo/git/checkouts/cubecl-aa41a28b39b598f9/bee7886/crates/cubecl-core/src/codegen/execution.rs:112:9
  33:     0x6226c770c0b5 - burn_jit::kernel::index::slice_assign::slice_assign::h286485b57eab134e
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-jit/src/kernel/index/slice_assign.rs:141:10
  34:     0x6226c7819eb8 - burn_jit::ops::int_ops::<impl burn_tensor::tensor::ops::int_tensor::IntTensorOps<burn_jit::backend::JitBackend<R,F,I>> for burn_jit::backend::JitBackend<R,F,I>>::int_slice_assign::h168b198041f8de54
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-jit/src/ops/int_ops.rs:66:9
  35:     0x6226c7819eb8 - <burn_tensor::tensor::api::kind::Int as burn_tensor::tensor::api::base::BasicOps<B>>::slice_assign::h4eaa0ba2665ac0b9
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/api/base.rs:1924:9
  36:     0x6226c7819eb8 - burn_tensor::tensor::api::base::Tensor<B,_,K>::slice_assign::hb52c9bbfaf427b66
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/api/base.rs:679:19
  37:     0x6226c7714fad - burn_tensor::tensor::ops::modules::cat::cat_with_slice_assign::h0d06cc9d4fb6c978
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/ops/modules/cat.rs:33:25
  38:     0x6226c7831af6 - burn_tensor::tensor::ops::int_tensor::IntTensorOps::int_cat::h417db617652049ea
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/ops/int_tensor.rs:278:9
  39:     0x6226c7831af6 - <burn_fusion::ops::int::<impl burn_tensor::tensor::ops::int_tensor::IntTensorOps<burn_fusion::backend::Fusion<B>> for burn_fusion::backend::Fusion<B>>::int_cat::CatOps<B,_> as burn_fusion::stream::execution::base::Operation<<B as burn_fusion::backend::FusionBackend>::FusionRuntime>>::execute::hf1bbbcc82dffc798
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/ops/int.rs:506:30
  40:     0x6226c76f8ee6 - burn_fusion::stream::execution::base::<impl burn_fusion::stream::base::OperationQueue<R>>::execute_operations::h799e53f486001ee6
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/stream/execution/base.rs:58:13
  41:     0x6226c76f8ee6 - burn_fusion::stream::execution::base::<impl burn_fusion::stream::base::OperationQueue<R>>::execute::h05d5b66d64105ede
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/stream/execution/base.rs:36:46
  42:     0x6226c76f8ee6 - <burn_fusion::stream::multi::Segment<R> as burn_fusion::stream::execution::processor::StreamSegment<<R as burn_fusion::backend::FusionRuntime>::Optimization>>::execute::h04f93808bbcbdc92
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/stream/multi.rs:146:9
  43:     0x6226c76f53e4 - burn_fusion::stream::execution::processor::Processor<O>::process::ha3056b7c1e2e2f5a
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/stream/execution/processor.rs:71:21
  44:     0x6226c76f4019 - burn_fusion::stream::multi::MultiStream<R>::register::h9d50ee63f080a775
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/stream/multi.rs:51:9
  45:     0x6226c76d170e - burn_fusion::server::FusionServer<R>::register::he6e1a1c050ff6f8b
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/server.rs:30:9
  46:     0x6226c76d170e - <burn_fusion::client::mutex::MutexFusionClient<R> as burn_fusion::client::base::FusionClient<R>>::register::ha452d1ebe6b87173
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/client/mutex.rs:48:14
  47:     0x6226c76d170e - burn_fusion::ops::int::<impl burn_tensor::tensor::ops::int_tensor::IntTensorOps<burn_fusion::backend::Fusion<B>> for burn_fusion::backend::Fusion<B>>::int_cat::h8e540e8505fcdda8
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-fusion/src/ops/int.rs:530:9
  48:     0x6226c781da28 - burn_autodiff::ops::int_tensor::<impl burn_tensor::tensor::ops::int_tensor::IntTensorOps<burn_autodiff::backend::Autodiff<B,C>> for burn_autodiff::backend::Autodiff<B,C>>::int_cat::h1fe9094b5d526e05
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-autodiff/src/ops/int_tensor.rs:63:9
  49:     0x6226c781da28 - <burn_tensor::tensor::api::kind::Int as burn_tensor::tensor::api::base::BasicOps<B>>::cat::h930f6cbeecd6bd27
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/api/base.rs:1969:9
  50:     0x6226c781da28 - burn_tensor::tensor::api::base::Tensor<B,_,K>::cat::he3ab3b7ade020c91
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-tensor/src/tensor/api/base.rs:775:19
  51:     0x6226c7789bd7 - <taxotangolib::model::TangoBatcher<B> as burn_core::data::dataloader::batcher::Batcher<taxotangolib::model::TaxaDistance<_>,taxotangolib::model::TangoBatch<B>>>::batch::h983835e16d77a25b
                               at /mnt/data/development/taxotango/src/model.rs:217:24
  52:     0x6226c7730cc6 - <burn_core::data::dataloader::batch::BatchDataloaderIterator<I,O> as core::iter::traits::iterator::Iterator>::next::hd56063b4f6fae01b
  53:     0x6226c773a1ca - <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next::hdba816b3ec983080
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/alloc/src/boxed.rs:1956:9
  54:     0x6226c773a1ca - <burn_core::data::dataloader::multithread::MultiThreadDataLoader<O> as burn_core::data::dataloader::base::DataLoader<O>>::iter::{{closure}}::{{closure}}::hc0d15202e4c51e2c
                               at /home/joseph/.cargo/git/checkouts/burn-178c6829f420dae1/58129d1/crates/burn-core/src/data/dataloader/multithread.rs:64:53
  55:     0x6226c773a1ca - std::sys_common::backtrace::__rust_begin_short_backtrace::hfa668fbfef1f2b94
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys_common/backtrace.rs:155:18
  56:     0x6226c773c5a5 - std::thread::Builder::spawn_unchecked_::{{closure}}::{{closure}}::hde37980062f244db
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/thread/mod.rs:542:17
  57:     0x6226c773c5a5 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h1a272682b6ffed44
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/panic/unwind_safe.rs:272:9
  58:     0x6226c773c5a5 - std::panicking::try::do_call::ha187b82e101f9b62
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:559:40
  59:     0x6226c773c5a5 - std::panicking::try::h0d99e57983f49568
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panicking.rs:523:19
  60:     0x6226c773c5a5 - std::panic::catch_unwind::hb8fab7762983e8ef
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/panic.rs:149:14
  61:     0x6226c773c5a5 - std::thread::Builder::spawn_unchecked_::{{closure}}::h1008ce57a16fa979
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/thread/mod.rs:541:30
  62:     0x6226c773c5a5 - core::ops::function::FnOnce::call_once{{vtable.shim}}::h0747bd60cd55a715
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/ops/function.rs:250:5
  63:     0x6226c801658b - <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::h09e5a4c541afa800
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/alloc/src/boxed.rs:2022:9
  64:     0x6226c801658b - <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::h9c8b03c22f4e7026
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/alloc/src/boxed.rs:2022:9
  65:     0x6226c801658b - std::sys::pal::unix::thread::Thread::new::thread_start::h522bc89a54da820a
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/sys/pal/unix/thread.rs:108:17
  66:     0x7cb5b129ca94 - start_thread
                               at ./nptl/pthread_create.c:447:8
  67:     0x7cb5b1329c3c - __GI___clone3
                               at ./misc/../sysdeps/unix/sysv/linux/x86_64/clone3.S:78
  68:                0x0 - <unknown>

Desktop (please complete the following information): OS: Linux (Description: Ubuntu 24.04 LTS); Rust: rustc 1.79.0 (129f3b996 2024-06-10)

Additional context: None, but let me know what I can do to help! I will keep trying variations and posting updates. It takes ~4 hours on my 2080 to reach the problem.

jguhlin commented 3 weeks ago

I've confirmed that this does not happen when using torch on GPU as the backend.