Closed hunwenpinghao closed 1 year ago
训练时报错:
FAILED: multi_tensor_adam.cuda.o /ssd/wphu/anaconda3/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/dee$ speed/ops/csrc/includes -I/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/ops/csrc/adam -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-pack$ ges/torch/include/torch/csrc/api/include -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include/TH -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include/THC -isystem /ssd/wphu/anaconda3/include -isystem /s sd/wphu/anaconda3/envs/visualglm/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -std=c++14 -c /ssd/wphu/anaconda3/envs/visualglm/ lib/python3.10/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o /bin/sh: 1: /ssd/wphu/anaconda3/bin/nvcc: not found [2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site- packages/deepspeed/ops/csrc/includes -I/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/ops/csrc/adam -isystem 
/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3. 10/site-packages/torch/include/torch/csrc/api/include -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include/TH -isystem /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/include/THC -isystem /ssd/wphu/anaconda3/includ e -isystem /ssd/wphu/anaconda3/envs/visualglm/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/op s/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o ninja: build stopped: subcommand failed. Traceback (most recent call last): File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1808, in _run_ninja_build subprocess.run( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/subprocess.py", line 526, in run raise CalledProcessError(retcode, process.args, subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1. 
The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/ssd/wphu/chatglm/VisualGLM-6B/finetune_visualglm.py", line 195, in <module> training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=create_dataset_function, collate_fn=data_collator) File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 98, in training_main model, optimizer = setup_model_untrainable_params_and_optimizer(args, model) File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 161, in setup_model_untrainable_params_and_optimizer model, optimizer, _, _ = deepspeed.initialize( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize engine = DeepSpeedEngine(args=args, File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__ self._configure_optimizer(optimizer, model_parameters) File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1162, in _configure_optimizer basic_optimizer = self._configure_basic_optimizer(model_parameters) File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1224, in _configure_basic_optimizer optimizer = FusedAdam( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 71, in __init__ fused_adam_cuda = FusedAdamBuilder().load() File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py", line 445, in load return self.jit_load(verbose) File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load op_module = load(name=self.name, File 
"/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1202, in load return _jit_compile( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1425, in _jit_compile _write_ninja_file_and_build_library( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1537, in _write_ninja_file_and_build_library _run_ninja_build( File "/ssd/wphu/anaconda3/envs/visualglm/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1824, in _run_ninja_build raise RuntimeError(message) from e RuntimeError: Error building extension 'fused_adam' [2023-06-05 14:26:03,449] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 64806 [2023-06-05 14:26:03,450] [ERROR] [launch.py:434:sigkill_handler] ['/ssd/wphu/anaconda3/envs/visualglm/bin/python', '-u', 'finetune_visualglm.py', '--local_rank=0', '--experiment-name', 'finetune-visualglm-6b', '--model-parallel-size', '1', '--mode', 'finetune', '--train-iters', '300', '--resume-dataloader', '--max_source_length', '64', '--max_target_length', '256', '--lora_rank', '10', '--layer_range', '0', '14', '--pre_seq_len', '4', '--train-data', './data/dataset.json', '--valid-data', './data/dataset.json', '--distributed-backend', 'nccl', '--lr-decay-style', 'cosine', '--warmup', '.02', '--checkpoint-activations', '--save-interval', '300', '--eval-interval', '10000', '--save', './checkpoints', '--split', '1', '--eval-iters', '10', '--eval-batch-size', '1', '--zero-stage', '1', '--lr', '0.0001', '--batch-size', '1', '--gradient-accumulation-steps', '4', '--skip-init', '--fp16', '--use_qlora'] exits with return code = 1
我之前遇到过同样的问题，我的解决方式是用 `conda create` 新建一个环境，并在其中重新安装所有依赖，之后这个问题就不再出现了。
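可以试试设置 CUDA_HOME 环境变量，例如 `export CUDA_HOME=/usr/local/cuda-11.6`（这里换成你本机 CUDA 的实际安装路径，该目录下应当存在 `bin/nvcc`）。从上面的日志看，编译时调用的是 `/ssd/wphu/anaconda3/bin/nvcc`，而该文件不存在（`nvcc: not found`），说明 CUDA 路径被解析到了 anaconda3 目录，所以 fused_adam 扩展编译失败。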
训练时报错: