System Info / 系統信息
4x A800
Who can help? / 谁可以帮助到您?
@1049451037
Information / 问题信息
Reproduction / 复现过程
torchrun --standalone --nnodes=1 --nproc-per-node=4 utils/merge_model.py --version base --from_pretrained /mnt/cache/huangzhiyuan/thudm/CogVLM-photograph/checkpoints/finetune-cogvlm-base-490-04-10-12-50
Traceback (most recent call last):
File "/mnt/cache/huangzhiyuan/thudm/CogVLM-photograph/utils/merge_model.py", line 42, in <module>
main()
File "/mnt/cache/huangzhiyuan/thudm/CogVLM-photograph/utils/merge_model.py", line 23, in main
model, model_args = FineTuneTestCogVLMModel.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/model/base_model.py", line 257, in from_pretrained
mp_merge_model_rank0(model, model_full)
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/mpu/operation.py", line 112, in mp_merge_model_rank0
iter_merge(model, model_full)
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/mpu/operation.py", line 111, in iter_merge
iter_merge(sub_new_model, sub_module)
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/mpu/operation.py", line 111, in iter_merge
iter_merge(sub_new_model, sub_module)
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/mpu/operation.py", line 111, in iter_merge
iter_merge(sub_new_model, sub_module)
[Previous line repeated 5 more times]
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/sat/mpu/operation.py", line 110, in itermerge
p.data.copy(torch.clone(np.data.cpu()).detach())
RuntimeError: The size of tensor a (1792) must match the size of tensor b (448) at non-singleton dimension 0
[2024-04-10 22:28:49,072] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1042137 closing signal SIGTERM
[2024-04-10 22:28:49,072] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1042138 closing signal SIGTERM
[2024-04-10 22:28:49,073] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1042139 closing signal SIGTERM
[2024-04-10 22:28:49,964] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1042136) of binary: /mnt/cache/huangzhiyuan/env/thudm/bin/python
Traceback (most recent call last):
File "/mnt/cache/huangzhiyuan/env/thudm/bin/torchrun", line 33, in
sys.exit(load_entry_point('torch==2.2.0', 'console_scripts', 'torchrun')())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 347, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 135, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/cache/huangzhiyuan/env/thudm/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
utils/merge_model.py FAILED
Failures:
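
For reference, the mismatch reported above is consistent with the 4-way launch: 448 × 4 = 1792, i.e. the destination parameter has the full (merged) dimension while the source tensor still has the size of a single model-parallel shard, which suggests the checkpoint's model-parallel size does not match the setting used for merging. A minimal sketch that reproduces the same RuntimeError from the shapes in the log (the second dimension and the variable names are illustrative, not taken from merge_model.py):

import torch

world_size = 4                       # --nproc-per-node=4 from the launch command
full_dim = 1792                      # size of tensor a in the error
shard_dim = full_dim // world_size   # 448, size of tensor b in the error

full_param = torch.empty(full_dim, 16)    # parameter with the merged dimension
shard_param = torch.empty(shard_dim, 16)  # one model-parallel shard of it

try:
    # same in-place copy pattern as iter_merge: shapes differ at dimension 0
    full_param.data.copy_(torch.clone(shard_param.data.cpu()).detach())
except RuntimeError as e:
    print(e)  # The size of tensor a (1792) must match the size of tensor b (448) ...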