string-new opened this issue 1 year ago (status: Open)
warnings.warn(
Traceback (most recent call last):

/home/zero_nlp/Chatglm6b_ModelParallel/train_model_all.py:321 in <module>
    318     train_dataset=tokenized_datasets["train"],
    319     eval_dataset=tokenized_datasets["valid"],
    320 )
  ❱ 321 trainer.train()
    322

/home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1629 in train
    1626         inner_training_loop = find_executable_batch_size(
    1627             self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
    1628         )
  ❱ 1629         return inner_training_loop(
    1630             args=args,
    1631             resume_from_checkpoint=resume_from_checkpoint,
    1632             trial=trial,

/home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1716 in _inner_training_loop
    1713         if args.gradient_checkpointing:
    1714             self.model.gradient_checkpointing_enable()
    1715
  ❱ 1716         model = self._wrap_model(self.model_wrapped)
    1717
    1718         if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None:
    1719             self._load_from_checkpoint(resume_from_checkpoint, model)

/home/zero_nlp/Chatglm6b_ModelParallel/MyTrainer.py:1541 in _wrap_model
    1538                 kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb
    1539             if is_torch_neuroncore_available():
    1540                 return model
  ❱ 1541             model = nn.parallel.DistributedDataParallel(
    1542                 model.cuda(),
    1543                 device_ids=[self.args.local_rank] if self.args._n_gpu != 0 else None,
    1544                 output_device=self.args.local_rank if self.args._n_gpu != 0 else None,

/root/miniconda3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py:625 in __init__
    622             self.output_device = _get_device_index(output_device, True)
    623
    624         if process_group is None:
  ❱ 625             self.process_group = _get_default_group()
    626         else:
    627             self.process_group = process_group
    628

/root/miniconda3/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:707 in _get_default_group
    704     Getting the default process group created by init_process_group
    705     """
    706     if not is_initialized():
  ❱ 707         raise RuntimeError(
    708             "Default process group has not been initialized, "
    709             "please make sure to call init_process_group."
    710         )

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
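For reference, DistributedDataParallel can only be constructed after the default process group exists, which is exactly what _get_default_group() is complaining about. Below is a minimal sketch of the initialization PyTorch expects before any DDP wrap; the environment variables, backend, rank, and world size here are illustrative assumptions for a single-node run, not this repo's actual launch configuration:

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_ddp():
    # torchrun / torch.distributed.launch normally set these; hard-coding them
    # here is only to make the sketch self-contained for a one-process group.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    # The default process group must be created before wrapping any model in DDP,
    # otherwise _get_default_group() raises the RuntimeError shown in the traceback.
    dist.init_process_group(backend="nccl", rank=0, world_size=1)

def wrap(model, local_rank):
    torch.cuda.set_device(local_rank)
    return DDP(model.cuda(), device_ids=[local_rank], output_device=local_rank)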
@OP, did you ever solve this? I'm hitting the same error.
Did you change any of the parameters? My code is written entirely for model parallelism, but judging from your error it looks like the problem comes from data parallelism being triggered. Please read my readme.md and the notes carefully. A sketch of the distinction is given below.
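To illustrate the maintainer's point: in a pure model-parallel run the Trainer should never reach the DistributedDataParallel wrap at all. The sketch below is a hypothetical guard (not this repo's actual code) assuming a transformers-style Trainer where local_rank stays at -1 when the script is launched with plain `python` rather than torchrun or torch.distributed.launch:

import torch
import torch.distributed as dist

def maybe_wrap_ddp(model, args):
    """Only wrap in DistributedDataParallel when a distributed (data-parallel)
    run was actually requested and initialized. In a pure model-parallel run
    (plain `python train_model_all.py`, local_rank left at -1), return the
    model unchanged so its layer placement across GPUs is preserved."""
    if args.local_rank == -1 or not dist.is_initialized():
        return model
    return torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank
    )

In other words, if the script is started through a distributed launcher, the Trainer assumes data parallelism and tries to build a DDP wrapper, which then fails because init_process_group was never called; launching it as a single process avoids that code path entirely.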