# nvidia-smi info:
[shibingli@loaclhost ~]$ sudo docker run --gpus=all --runtime=nvidia --rm -it -v /data/:/data/ -v /data/agi/Chinese-LLaMA-Alpaca-Docker/envs/:/opt/app/envs/ rl-agi:latest bash
[sudo] password for shibingli:
==========
== CUDA ==
==========
CUDA Version 11.8.0
Container image Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
root@d91c734b9499:/opt/app# nvidia-smi
Fri Jul 7 09:23:35 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A800 80GB PCIe Off | 00000000:12:00.0 Off | 0 |
| N/A 34C P0 42W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A800 80GB PCIe Off | 00000000:13:00.0 Off | 0 |
| N/A 33C P0 44W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA A800 80GB PCIe Off | 00000000:14:00.0 Off | 0 |
| N/A 35C P0 47W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA A800 80GB PCIe Off | 00000000:48:00.0 Off | 0 |
| N/A 34C P0 42W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 4 NVIDIA A800 80GB PCIe Off | 00000000:49:00.0 Off | 0 |
| N/A 34C P0 43W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 5 NVIDIA A800 80GB PCIe Off | 00000000:89:00.0 Off | 0 |
| N/A 34C P0 44W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 6 NVIDIA A800 80GB PCIe Off | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 42W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 7 NVIDIA A800 80GB PCIe Off | 00000000:C0:00.0 Off | 0 |
| N/A 34C P0 45W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 8 NVIDIA A800 80GB PCIe Off | 00000000:C1:00.0 Off | 0 |
| N/A 33C P0 44W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 9 NVIDIA A800 80GB PCIe Off | 00000000:C2:00.0 Off | 0 |
| N/A 34C P0 44W / 300W | 18MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
+---------------------------------------------------------------------------------------+
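A side note on the launch command above: the container is started without an explicit shared-memory size, and Docker's default /dev/shm is only 64 MB, which multi-process training can exhaust and which can surface as SIGBUS. Below is a minimal sketch of the same launch with a larger /dev/shm; the 64g value (and the --ipc=host alternative mentioned in the comment) is an assumption, not part of the original setup.

```bash
# Sketch: same launch as above, but with an explicit shared-memory size.
# The 64g value is an assumption; --ipc=host (sharing the host's /dev/shm)
# is a common alternative.
sudo docker run --gpus=all --runtime=nvidia --rm -it \
  --shm-size=64g \
  -v /data/:/data/ \
  -v /data/agi/Chinese-LLaMA-Alpaca-Docker/envs/:/opt/app/envs/ \
  rl-agi:latest bash
```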
Run logs or screenshots
# Run log
root@d91c734b9499:/opt/app# bash /data/agi/Chinese-LLaMA-Alpaca/scripts/training/run_pt.sh
[2023-07-07 09:29:01,360] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,364] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,365] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,365] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,398] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,414] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:01,417] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-07-07 09:29:03,691] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,691] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,785] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,785] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,853] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,854] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,862] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,862] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,864] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,864] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,864] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2023-07-07 09:29:03,867] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,867] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-07 09:29:03,871] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-07 09:29:03,871] [INFO] [comm.py:594:init_distributed] cdb=None
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 5, device: cuda:5, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True
[INFO|configuration_utils.py:667] 2023-07-07 09:29:04,229 >> loading configuration file /data/agi/LoRA/hf/7B_Llama_Plus/config.json
[INFO|configuration_utils.py:725] 2023-07-07 09:29:04,229 >> Model config LlamaConfig {
"_name_or_path": "/data/agi/LoRA/hf/7B_Llama_Plus",
"architectures": [
"LlamaForCausalLM"
],
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 2048,
"model_type": "llama",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"pad_token_id": 0,
"rms_norm_eps": 1e-06,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.30.2",
"use_cache": true,
"vocab_size": 49953
}
[INFO|tokenization_utils_base.py:1821] 2023-07-07 09:29:04,230 >> loading file tokenizer.model
[INFO|tokenization_utils_base.py:1821] 2023-07-07 09:29:04,230 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:1821] 2023-07-07 09:29:04,230 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:1821] 2023-07-07 09:29:04,230 >> loading file tokenizer_config.json
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 4, device: cuda:4, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:04 - WARNING - __main__ - Process rank: 6, device: cuda:6, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:05 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: True
07/07/2023 09:29:05 - INFO - datasets.builder - Using custom data configuration default-4e021b6fe6b72b11
07/07/2023 09:29:05 - INFO - datasets.info - Loading Dataset Infos from /opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/datasets/packaged_modules/text
07/07/2023 09:29:05 - INFO - datasets.builder - Generating dataset text (/data/agi/nh/cache/txt/knowledge_text/text/default-4e021b6fe6b72b11/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)
Downloading and preparing dataset text/default to /data/agi/nh/cache/txt/knowledge_text/text/default-4e021b6fe6b72b11/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...
Downloading data files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11275.01it/s]
07/07/2023 09:29:05 - INFO - datasets.download.download_manager - Downloading took 0.0 min
07/07/2023 09:29:05 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min
Extracting data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1593.58it/s]
07/07/2023 09:29:05 - INFO - datasets.builder - Generating train split
07/07/2023 09:29:05 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.
Dataset text downloaded and prepared to /data/agi/nh/cache/txt/knowledge_text/text/default-4e021b6fe6b72b11/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 848.53it/s]
07/07/2023 09:29:05 - INFO - __main__ - knowledge.txt has been loaded
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #0 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00000_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #1 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00001_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #2 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00002_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #3 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00003_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #4 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00004_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #5 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00005_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #6 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00006_of_00008.arrow
07/07/2023 09:29:05 - INFO - datasets.arrow_dataset - Process #7 will write at /data/agi/nh/cache/txt/knowledge_text/tokenized_00007_of_00008.arrow
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1337 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -7) local_rank: 0 (pid: 1336) of binary: /opt/app/envs/venv_peft_13e53fc/bin/python
Traceback (most recent call last):
File "/opt/app/envs/venv_peft_13e53fc/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/app/envs/venv_peft_13e53fc/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/data/agi/Chinese-LLaMA-Alpaca/scripts/training/run_clm_pt_with_peft.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 2 (local_rank: 2)
exitcode : -7 (pid: 1338)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1338
[2]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 3 (local_rank: 3)
exitcode : -7 (pid: 1339)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1339
[3]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 4 (local_rank: 4)
exitcode : -7 (pid: 1340)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1340
[4]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 5 (local_rank: 5)
exitcode : -7 (pid: 1341)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1341
[5]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 6 (local_rank: 6)
exitcode : -7 (pid: 1342)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1342
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-07-07_09:29:19
host : d91c734b9499
rank : 0 (local_rank: 0)
exitcode : -7 (pid: 1336)
error_file: <N/A>
traceback : Signal 7 (SIGBUS) received by PID 1336
============================================================
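Every failed rank above exits with Signal 7 (SIGBUS). A couple of generic checks that could be run inside the container before retrying are sketched below; these are suggestions only, not steps taken in the original report.

```bash
# How large is the container's /dev/shm, and how full does it get during the run?
df -h /dev/shm

# Re-run with NCCL debug logging enabled to see how far communicator setup gets.
NCCL_DEBUG=INFO bash /data/agi/Chinese-LLaMA-Alpaca/scripts/training/run_pt.sh
```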
Items that must be checked before submitting
Issue type
Model training and fine-tuning
Base model
LLaMA-Plus-7B
Operating system
Linux
Detailed description of the problem
Training on a single machine with 10 GPUs fails. With torchrun's --nproc_per_node set to 2 the run works normally, but with any value greater than 2 it errors out (sketched below). Has anyone dealt with a similar problem? The versions of the dependency libraries all look normal, yet the training command fails whenever more than 2 GPUs are used. Memory info:
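To make the two configurations concrete, a rough sketch of the launch is given below; the script path is taken from the log above, the --nproc_per_node values from the description, and every other training argument is omitted (the actual run_pt.sh passes many more).

```bash
# Assumed shape of the torchrun launch inside run_pt.sh (training args omitted).
# Works on this machine:
torchrun --nnodes 1 --nproc_per_node 2 \
  /data/agi/Chinese-LLaMA-Alpaca/scripts/training/run_clm_pt_with_peft.py  # ...training args

# Fails with SIGBUS: any value greater than 2, e.g. all ten GPUs:
torchrun --nnodes 1 --nproc_per_node 10 \
  /data/agi/Chinese-LLaMA-Alpaca/scripts/training/run_clm_pt_with_peft.py  # ...training args
```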
Dependencies (must be provided for code-related issues)
Run logs or screenshots