(cpmbee) hope@hope-08:~/work/CPM-Bee/src$ bash scripts/finetune_cpm_bee.sh
torchrun --nnodes=1 --nproc_per_node=1 --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=localhost:12345 finetune_cpm_bee.py --use-delta --model-config config/cpm-bee-10b.json --dataset path/to/dataset --eval_dataset path/to/eval/dataset --epoch 100 --batch-size 5 --train-iters 100 --save-name cpm_bee_finetune --max-length 2048 --save results/ --lr 0.0001 --inspect-iters 100 --warmup-iters 1 --eval-interval 1000 --early-stop-patience 5 --lr-decay-style noam --weight-decay 0.01 --clip-grad 1.0 --loss-scale 32768 --start-step 0 --load model.pt
====================== Initialization ======================
rank       : 0
local_rank : 0
world_size : 1
local_size : 1
master     : hope-08:46111
device     : 0
cpus       : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/hope/work/CPM-Bee/src/finetune_cpm_bee.py:431 in <module> │
│ │
│ 428 │
│ 429 │
│ 430 if __name__ == "__main__": │
│ ❱ 431 │ main() │
│ 432 │
│ │
│ /home/hope/work/CPM-Bee/src/finetune_cpm_bee.py:426 in main │
│ │
│ 423 │
│ 424 def main(): │
│ 425 │ args = initialize() │
│ ❱ 426 │ tokenizer, model, optimizer, lr_scheduler, optim_manager = setup_model_and_optimizer │
│ 427 │ finetune(args, tokenizer, model, optimizer, lr_scheduler, optim_manager) │
│ 428 │
│ 429 │
│ │
│ /home/hope/work/CPM-Bee/src/finetune_cpm_bee.py:74 in setup_model_and_optimizer │
│ │
│ 71 │
│ 72 │
│ 73 def setup_model_and_optimizer(args): │
│ ❱ 74 │ model = get_model(args) │
│ 75 │ tokenizer = get_tokenizer(args) │
│ 76 │ bmt.synchronize() │
│ 77 │ optimizer = get_optimizer(args, model) │
│ │
│ /home/hope/work/CPM-Bee/src/finetune_cpm_bee.py:40 in get_model │
│ │
│ 37 │ model = CPMBee(config) │
│ 38 │ model.config = config │
│ 39 │ if args.load is not None: │
│ ❱ 40 │ │ bmt.load(model, args.load) │
│ 41 │ else: │
│ 42 │ │ bmt.init_parameters(model) │
│ 43 │ # insert LoRA │
│ │
│ /home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/bmtrain/store.py:223 in load │
│ │
│ 220 │ │ >>> bmtrain.load(model, "model.pt", strict=True) │
│ 221 │ """ │
│ 222 │ if config['rank'] == 0: │
│ ❱ 223 │ │ state_dict = DistributedStateDictWrapper(torch.load(file_name)) │
│ 224 │ else: │
│ 225 │ │ state_dict = DistributedStateDictWrapper({}) │
│ 226 │
│ │
│ /home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/serialization.py:771 in │
│ load │
│ │
│ 768 │ if 'encoding' not in pickle_load_args.keys(): │
│ 769 │ │ pickle_load_args['encoding'] = 'utf-8' │
│ 770 │ │
│ ❱ 771 │ with _open_file_like(f, 'rb') as opened_file: │
│ 772 │ │ if _is_zipfile(opened_file): │
│ 773 │ │ │ # The zipfile reader is going to advance the current file position. │
│ 774 │ │ │ # If we want to actually tail call to torch.jit.load, we need to │
│ │
│ /home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/serialization.py:270 in │
│ _open_file_like │
│ │
│ 267 │
│ 268 def _open_file_like(name_or_buffer, mode): │
│ 269 │ if _is_path(name_or_buffer): │
│ ❱ 270 │ │ return _open_file(name_or_buffer, mode) │
│ 271 │ else: │
│ 272 │ │ if 'w' in mode: │
│ 273 │ │ │ return _open_buffer_writer(name_or_buffer) │
│ │
│ /home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/serialization.py:251 in │
│ __init__ │
│ │
│ 248 │
│ 249 class _open_file(_opener): │
│ 250 │ def __init__(self, name, mode): │
│ ❱ 251 │ │ super(_open_file, self).__init__(open(name, mode)) │
│ 252 │ │
│ 253 │ def __exit__(self, *args): │
│ 254 │ │ self.file_like.close() │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
FileNotFoundError: [Errno 2] No such file or directory: 'model.pt'
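The failing call is `bmt.load(model, args.load)` with `--load model.pt`: `bmtrain.load` hands the path straight to `torch.load`, which opens it relative to the current working directory (`~/work/CPM-Bee/src` here), so the error simply means no `model.pt` exists at that location. A minimal sketch of an early path check that could sit before the `bmt.load` call (the helper name `check_load_path` is illustrative, not part of the repo):

```python
import os

def check_load_path(path):
    """Fail fast with a readable message before bmt.load / torch.load is reached."""
    if path is not None and not os.path.isfile(path):
        raise FileNotFoundError(
            f"--load points to '{path}', but no such file exists relative to "
            f"{os.getcwd()!r}; pass the real path of the downloaded CPM-Bee checkpoint."
        )

# e.g. at the top of get_model() in finetune_cpm_bee.py:
# check_load_path(args.load)
```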
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2004662) of binary: /home/hope/miniconda3/envs/cpmbee/bin/python
Traceback (most recent call last):
File "/home/hope/miniconda3/envs/cpmbee/bin/torchrun", line 8, in
sys.exit(main())
File "/home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f( args, **kwargs)
File "/home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/hope/miniconda3/envs/cpmbee/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
finetune_cpm_bee.py FAILED
Failures:
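The likely fix is to download the CPM-Bee-10B checkpoint and point `--load` at its actual location instead of the placeholder `model.pt`. A quick sanity check before relaunching the full torchrun job, with a placeholder path to substitute for the real one:

```python
import torch

ckpt_path = "/path/to/cpm-bee-10b.pt"  # placeholder; use the real checkpoint location

state_dict = torch.load(ckpt_path, map_location="cpu")
print(f"loaded {len(state_dict)} entries from {ckpt_path}")
```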