CUDA_VISIBLE_DEVICES=0,1,2,3 python pretraining.py
I have already set this, but I still get the same error:
Traceback (most recent call last):
File "pretraining.py", line 767, in <module>
main()
File "pretraining.py", line 728, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1672, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1227, in prepare
result = tuple(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1228, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1104, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1355, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 809, in __init__
self._ddp_init_helper(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1098, in _ddp_init_helper
self.reducer = dist.Reducer(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 64.05 GiB. GPU 1 has a total capacty of 79.33 GiB of which 11.05 GiB is free. Process 2993049 has 3.10 GiB memory in use. Including non-PyTorch memory, this process has 65.17 GiB memory in use. Of the allocated memory 64.17 GiB is allocated by PyTorch, and 323.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "pretraining.py", line 767, in <module>
main()
File "pretraining.py", line 728, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1672, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1227, in prepare
result = tuple(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1228, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1104, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1355, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 809, in __init__
self._ddp_init_helper(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1098, in _ddp_init_helper
self.reducer = dist.Reducer(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 64.05 GiB. GPU 2 has a total capacty of 79.33 GiB of which 14.15 GiB is free. Including non-PyTorch memory, this process has 65.17 GiB memory in use. Of the allocated memory 64.17 GiB is allocated by PyTorch, and 323.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "pretraining.py", line 767, in <module>
main()
File "pretraining.py", line 728, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1672, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1227, in prepare
result = tuple(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1228, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1104, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1355, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 809, in __init__
self._ddp_init_helper(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1098, in _ddp_init_helper
self.reducer = dist.Reducer(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 64.05 GiB. GPU 3 has a total capacty of 79.33 GiB of which 14.24 GiB is free. Including non-PyTorch memory, this process has 65.07 GiB memory in use. Of the allocated memory 64.17 GiB is allocated by PyTorch, and 323.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
File "pretraining.py", line 767, in <module>
main()
File "pretraining.py", line 728, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/transformers/trainer.py", line 1672, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1227, in prepare
result = tuple(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1228, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1104, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/accelerate/accelerator.py", line 1355, in prepare_model
model = torch.nn.parallel.DistributedDataParallel(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 809, in __init__
self._ddp_init_helper(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1098, in _ddp_init_helper
self.reducer = dist.Reducer(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 64.05 GiB. GPU 0 has a total capacty of 79.33 GiB of which 14.24 GiB is free. Including non-PyTorch memory, this process has 65.07 GiB memory in use. Of the allocated memory 64.17 GiB is allocated by PyTorch, and 323.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
[2024-01-23 15:22:18,696] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 3272628) of binary: /home/centos/anaconda3/envs/cpt/bin/python
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
pretraining.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-01-23_15:22:18
host : host188
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 3272629)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-01-23_15:22:18
host : host188
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 3272630)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-01-23_15:22:18
host : host188
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 3272631)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-01-23_15:22:18
host : host188
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3272628)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
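For reference, each OOM message above ends by suggesting the allocator setting via PYTORCH_CUDA_ALLOC_CONF. A minimal way to try it is to export the variable before the launch command; the 128 MB value below is only an illustrative starting point, and fragmentation tuning alone cannot satisfy a single 64.05 GiB allocation:

# Illustrative: pass the allocator hint from the OOM message to every rank
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 \
CUDA_VISIBLE_DEVICES=2,3 torchrun --nproc_per_node 2 pretraining.py ...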
Describe the bug
The run_pt script is as follows:
CUDA_VISIBLE_DEVICES=2,3 torchrun --nproc_per_node 2 pretraining.py \
    --model_type auto \
    --model_name_or_path ../Yi-34B-Chat \
    --train_file_dir ./data/pretrain \
    --validation_file_dir ./data/pretrain \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --do_train \
    --do_eval \
    --use_peft False \
    --seed 42 \
    --max_train_samples 10000 \
    --max_eval_samples 10 \
    --num_train_epochs 0.5 \
    --learning_rate 2e-4 \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --logging_strategy steps \
    --logging_steps 10 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --save_steps 500 \
    --save_strategy steps \
    --save_total_limit 13 \
    --gradient_accumulation_steps 1 \
    --preprocessing_num_workers 10 \
    --block_size 16 \
    --group_by_length True \
    --output_dir outputs-pt-Yi-v3 \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --target_modules all \
    --torch_dtype bfloat16 \
    --bf16 \
    --device_map auto \
    --report_to tensorboard \
    --ddp_find_unused_parameters False \
    --gradient_checkpointing True \
    --cache_dir ./cache
The error reported is shown above.
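As a quick sanity check that the CUDA_VISIBLE_DEVICES mask is actually taking effect (the visible GPUs are renumbered 0..N-1 inside the process), the device count can be checked from Python; this one-liner is purely illustrative:

CUDA_VISIBLE_DEVICES=2,3 python -c "import torch; print(torch.cuda.device_count())"  # prints 2 if the mask is applied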
One more question: what does --block_size actually do? GPU memory usage can already be adjusted via batch_size, so why is this parameter needed as well?
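For context on --block_size: in Hugging Face-style pretraining scripts like this one, block_size is typically the token sequence length that the tokenized corpus is concatenated and chunked into, so it controls per-sample activation memory independently of batch_size. Below is a minimal sketch of that grouping step in the style of the run_clm example; the names are illustrative and not necessarily identical to what pretraining.py does internally:

# Illustrative sketch of the usual group_texts step in HF pretraining scripts.
# block_size is the fixed sequence length each training example is cut to.
from itertools import chain

block_size = 16  # value from the run_pt script above; typical values are 512-4096

def group_texts(examples):
    # Concatenate all tokenized texts, then split them into block_size chunks.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size  # drop the remainder
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()  # causal LM: labels = inputs
    return result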