RWKV is an RNN with transformer-level LLM performance. It can be directly trained like a GPT (parallelizable). So it's combining the best of RNN and transformer - great performance, fast inference, saves VRAM, fast training, "infinite" ctx_len, and free sentence embedding.
ninja: no work to do.
Loading extension module fused_adam...
Time to load fused_adam op: 0.4050893783569336 seconds
Using /home/test/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Emitting ninja build file /home/test/.cache/torch_extensions/py38_cu117/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.3920328617095947 seconds
Rank: 0 partition count [1, 1, 1] and sizes[(6827008, False), (1024, False), (1024, False)]
Traceback (most recent call last):
File "train.py", line 344, in <module>
trainer.fit(model, data_loader)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1093, in _run
self.strategy.setup(self)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 345, in setup
self.init_deepspeed()
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 456, in init_deepspeed
self._initialize_deepspeed_train(model)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 493, in _initialize_deepspeed_train
model, deepspeed_optimizer = self._setup_model_and_optimizer(model, optimizer, scheduler)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 414, in _setup_model_and_optimizer
deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initialize(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 309, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1185, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1420, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 494, in __init__
self._link_all_hp_params()
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 524, in _link_all_hp_params
link_hp_params(lp_param_list=self.bit16_groups[i],
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/utils/mixed_precision_linkage.py", line 15, in link_hp_params
lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict,
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/utils/tensor_fragment.py", line 156, in get_hp_fragment_mapping
assert fragment_start < fragment_end, \
AssertionError: fragment start 6825984 should be < fragment_end 6825984
用以下命令训练: CUDA_VISIBLE_DEVICES=0 python train.py --load_model "" --wandb "" --proj_dir "./output" --data_file "dataset/rwkv_text_document" --data_type binidx --vocab_size 0 --ctx_len 512 --epoch_steps 5000 --epoch_count 500 --epoch_begin 0 --epoch_save 5 --micro_bsz 1 --n_layer 2 --n_embd 512 --pre_ffn 0 --head_qk 0 --lr_init 8e-4 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.99 --adam_eps 1e-8 --accelerator gpu --devices 1 --precision bf16 --strategy deepspeed_stage_2 --grad_cp 0
服务器配置:1*RTX3090
请教各位大佬,报错(如下)怎么解决:
ninja: no work to do. Loading extension module fused_adam... Time to load fused_adam op: 0.4050893783569336 seconds Using /home/test/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... Emitting ninja build file /home/test/.cache/torch_extensions/py38_cu117/utils/build.ninja... Building extension module utils... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) ninja: no work to do. Loading extension module utils... Time to load utils op: 0.3920328617095947 seconds Rank: 0 partition count [1, 1, 1] and sizes[(6827008, False), (1024, False), (1024, False)] Traceback (most recent call last): File "train.py", line 344, in <module>
trainer.fit(model, data_loader)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1093, in _run
self.strategy.setup(self)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 345, in setup
self.init_deepspeed()
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 456, in init_deepspeed
self._initialize_deepspeed_train(model)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 493, in _initialize_deepspeed_train
model, deepspeed_optimizer = self._setup_model_and_optimizer(model, optimizer, scheduler)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/pytorch_lightning/strategies/deepspeed.py", line 414, in _setup_model_and_optimizer
deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initialize(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 309, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1185, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1420, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 494, in __init__
self._link_all_hp_params()
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 524, in _link_all_hp_params
link_hp_params(lp_param_list=self.bit16_groups[i],
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/utils/mixed_precision_linkage.py", line 15, in link_hp_params
lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict,
File "/home/test/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed/utils/tensor_fragment.py", line 156, in get_hp_fragment_mapping
assert fragment_start < fragment_end, \
AssertionError: fragment start 6825984 should be < fragment_end 6825984