felix-andreas / gpt-burn

Implementation of the GPT architecture in Rust 🦀 + Burn 🔥
Universal Permissive License v1.0
42 stars 2 forks source link

runtime panic while training #2

Open miaomiao1992 opened 1 month ago

miaomiao1992 commented 1 month ago
    Finished `dev` profile [unoptimized + debuginfo] target(s) in 3m 33s
     Running `D:/rust-target\debug\gpt-burn.exe train --context-length 128 --n-layers 12 --n-heads 12 --d-model 768 --batch-size 128 --learning-rate 0.0003 --seed 0 --text-corpus corpus.txt`
load entire file "corpus.txt" as dataset
start training - parameters: 85277288
{
  "n_steps": 50,
  "batch_size": 128,
  "learning_rate": 0.0003,
  "batches_per_step": 100,
  "validation_size": 128,
  "seed": 0,
  "model": {
    "context_length": 128,
    "vocab_size": 104,
    "n_layers": 12,
    "n_heads": 12,
    "d_model": 768,
    "d_hidden": 3072,
    "dropout": 0.2
  },
  "optimizer": {
    "grad_clipping": null,
    "beta_1": 0.9,
    "beta_2": 0.999,
    "epsilon": 0.00001,
    "weight_decay": 0.0001
  }
}
thread 'main' panicked at D:\gpt-burn-main\gpt-burn-main-59d98fd83cedefb07da861cda5260e216edf4697\src\train.rs:133:35:
attempt to subtract with overflow
stack backtrace:
   0:     0x7ff744ccb93a - std::backtrace_rs::backtrace::dbghelp64::trace
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\..\..\backtrace\src\backtrace\dbghelp64.rs:99
   1:     0x7ff744ccb93a - std::backtrace_rs::backtrace::trace_unsynchronized
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\..\..\backtrace\src\backtrace\mod.rs:66
   2:     0x7ff744ccb93a - std::sys_common::backtrace::_print_fmt
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\sys_common\backtrace.rs:68
   3:     0x7ff744ccb93a - std::sys_common::backtrace::_print::impl$0::fmt
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\sys_common\backtrace.rs:44
   4:     0x7ff744ce520b - core::fmt::rt::Argument::fmt
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\core\src\fmt\rt.rs:142
   5:     0x7ff744ce520b - core::fmt::write
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\core\src\fmt\mod.rs:1153
   6:     0x7ff744cc7c01 - std::io::Write::write_fmt<std::sys::pal::windows::stdio::Stderr>
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\io\mod.rs:1843
   7:     0x7ff744ccb726 - std::sys_common::backtrace::print
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\sys_common\backtrace.rs:34
   8:     0x7ff744ccd8cf - std::panicking::default_hook::closure$1
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\panicking.rs:272
   9:     0x7ff744ccd567 - std::panicking::default_hook
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\panicking.rs:292
  10:     0x7ff744ccde0d - std::panicking::rust_panic_with_hook
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\panicking.rs:779
  11:     0x7ff744ccdc8b - std::panicking::begin_panic_handler::closure$0
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\panicking.rs:649
  12:     0x7ff744ccbfc9 - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\sys_common\backtrace.rs:171
  13:     0x7ff744ccd986 - std::panicking::begin_panic_handler
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\panicking.rs:645
  14:     0x7ff744e489d7 - core::panicking::panic_fmt
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\core\src\panicking.rs:72
  15:     0x7ff744e48ab2 - core::panicking::panic
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\core\src\panicking.rs:145
  16:     0x7ff743b685d3 - gpt_burn::train::get_batch::closure$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::AutoGraphicsApi,f32,i32> > >,rand::rngs::std::StdRng>
                               at D:\gpt-burn-main\gpt-burn-main-59d98fd83cedefb07da861cda5260e216edf4697\src\train.rs:133
  17:     0x7ff743d6dbbf - core::iter::adapters::map::map_fold::closure$0<usize,usize,tuple$<>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::AutoGraphicsApi,f32,i32> > >,rand
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\iter\adapters\map.rs:89
  18:     0x7ff743e2d43f - core::iter::traits::iterator::Iterator::fold<core::ops::range::Range<usize>,tuple$<>,core::iter::adapters::map::map_fold::closure_env$0<usize,usize,tuple$<>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBacke
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\iter\traits\iterator.rs:2587
  19:     0x7ff743d65bee - core::iter::adapters::map::impl$2::fold<usize,core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::AutoGraphicsApi,f32,i32>
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\iter\adapters\map.rs:129
  20:     0x7ff743d6bc98 - core::iter::traits::iterator::Iterator::for_each<core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\iter\traits\iterator.rs:817
  21:     0x7ff743c10f5b - alloc::vec::Vec<usize,alloc::alloc::Global>::extend_trusted<usize,alloc::alloc::Global,core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\alloc\src\vec\mod.rs:3020
  22:     0x7ff743c21b9e - alloc::vec::spec_extend::impl$1::spec_extend<usize,core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\alloc\src\vec\spec_extend.rs:26
  23:     0x7ff743c094a3 - alloc::vec::spec_from_iter_nested::impl$1::from_iter<usize,core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<b
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\alloc\src\vec\spec_from_iter_nested.rs:62
  24:     0x7ff743c220d1 - alloc::vec::spec_from_iter::impl$0::from_iter<usize,core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgp
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\alloc\src\vec\spec_from_iter.rs:33
  25:     0x7ff743c21527 - alloc::vec::impl$14::from_iter<usize,core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::Au
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\alloc\src\vec\mod.rs:2894
  26:     0x7ff743d6b8e1 - core::iter::traits::iterator::Iterator::collect<core::iter::adapters::map::Map<core::ops::range::Range<usize>,gpt_burn::train::get_batch::closure_env$0<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::g
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\iter\traits\iterator.rs:2003
  27:     0x7ff743b6793f - gpt_burn::train::get_batch<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::AutoGraphicsApi,f32,i32> > >,rand::rngs::std::StdRng>
                               at D:\gpt-burn-main\gpt-burn-main-59d98fd83cedefb07da861cda5260e216edf4697\src\train.rs:132
  28:     0x7ff743b6583e - gpt_burn::train::train<burn_autodiff::backend::Autodiff<burn_fusion::backend::Fusion<burn_jit::backend::JitBackend<burn_wgpu::runtime::WgpuRuntime<burn_wgpu::graphics::AutoGraphicsApi,f32,i32> > >,burn_autodiff::checkpoint::strategy::NoCheckpointing> >
                               at D:\gpt-burn-main\gpt-burn-main-59d98fd83cedefb07da861cda5260e216edf4697\src\train.rs:55
  29:     0x7ff743aad1f8 - gpt_burn::main
                               at D:\gpt-burn-main\gpt-burn-main-59d98fd83cedefb07da861cda5260e216edf4697\src\main.rs:137
  30:     0x7ff743cc992b - core::ops::function::FnOnce::call_once<void (*)(),tuple$<> >
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\ops\function.rs:250
  31:     0x7ff743a5780e - core::hint::black_box
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\core\src\hint.rs:337
  32:     0x7ff743a5780e - std::sys_common::backtrace::__rust_begin_short_backtrace<void (*)(),tuple$<> >
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\std\src\sys_common\backtrace.rs:155
  33:     0x7ff743d59da1 - std::rt::lang_start::closure$0<tuple$<> >
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\std\src\rt.rs:166
  34:     0x7ff744cc17b2 - std::rt::lang_start_internal
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library\std\src\rt.rs:148
  35:     0x7ff743d59d7a - std::rt::lang_start<tuple$<> >
                               at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6\library\std\src\rt.rs:165
  36:     0x7ff743ab4219 - main
  37:     0x7ff744e46350 - invoke_main
                               at D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:78
  38:     0x7ff744e46350 - __scrt_common_main_seh
                               at D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:288
  39:     0x7ffa27407034 - BaseThreadInitThunk
  40:     0x7ffa2865d0d1 - RtlUserThreadStart
error: process didn't exit successfully: `D:/rust-target\debug\gpt-burn.exe train --context-length 128 --n-layers 12 --n-heads 12 --d-model 768 --batch-size 128 --learning-rate 0.0003 --seed 0 --text-corpus corpus.txt` (exit code: 101)

corpus.txt is as follows:

你是谁? 我是Jack。

你今年几岁? 我今年32岁。

你女儿是谁? 我女儿是小圆圆。

felix-andreas commented 1 month ago

The issue is that I have only implemented a simple tokenizer, which supports the Latin alphabet and filters out every other character. The resulting text corpus becomes "\nJack\n\n\n32\n\n\n\n", which is shorter than the context length, leading to the underflow.

The solution would be a more sophisticated tokenizer: just allowing all Unicode characters would probably blow up the vocab size too much. Maybe in the future I will implement a BPE-based tokenizer, as used by GPT-2, which also supports arbitrary Unicode characters.