Open visionshao opened 11 months ago
原因:通过 conda 安装的 CUDA 不包含头文件(.h),而 DeepSpeed 初始化时需要即时编译 cpu_adam 扩展,会用到 cuda_runtime.h、cuda_fp16.h 等头文件。解决方法:安装完整版的 CUDA Toolkit(install CUDA to /usr/local)。
reference links: cuda安装 https://zhuanlan.zhihu.com/p/490246520 https://zhuanlan.zhihu.com/p/367740437
CUDA理解 https://blog.csdn.net/qq_41094058/article/details/116207333 https://zhuanlan.zhihu.com/p/91334380
Detected CUDA files, patching ldflags Emitting ninja build file /mnt/cache/weishao4/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... Building extension module cpu_adam... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) [1/3] /mnt/cache/weishao4/anaconda3/envs/toxicity/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/TH -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/THC -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -DCUDA_NO_HALF_OPERATORS -DCUDA_NO_HALF_CONVERSIONS -DCUDA_NO_BFLOAT16_CONVERSIONS -DCUDA_NO_HALF2_OPERATORS --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -UCUDA_NO_HALF_OPERATORS -UCUDA_NO_HALF_CONVERSIONS -UCUDA_NO_HALF2_OPERATORS -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86 -DBF16_AVAILABLE -c /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o FAILED: custom_cuda_kernel.cuda.o /mnt/cache/weishao4/anaconda3/envs/toxicity/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H 
-DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/TH -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/THC -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -DCUDA_NO_HALF_OPERATORS -DCUDA_NO_HALF_CONVERSIONS -DCUDA_NO_BFLOAT16_CONVERSIONS -DCUDA_NO_HALF2_OPERATORS --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -UCUDA_NO_HALF_OPERATORS -UCUDA_NO_HALF_CONVERSIONS -UCUDA_NO_HALF2_OPERATORS -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86 -DBF16_AVAILABLE -c /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o cc1plus: fatal error: cuda_runtime.h: No such file or directory compilation terminated. 
[2/3] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/TH -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/THC -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++17 -g -Wno-reorder -L/mnt/cache/weishao4/anaconda3/envs/toxicity/lib64 -lcudart -lcublas -g -march=native -fopenmp -DAVX512 -DENABLE_CUDA -DBF16_AVAILABLE -c /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o FAILED: cpu_adam.o c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/TH -isystem 
/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/include/THC -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include -isystem /mnt/cache/weishao4/anaconda3/envs/toxicity/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++17 -g -Wno-reorder -L/mnt/cache/weishao4/anaconda3/envs/toxicity/lib64 -lcudart -lcublas -g -march=native -fopenmp -DAVX512 -DENABLE_CUDA -DBF16_AVAILABLE -c /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o In file included from /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp:6: /mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/csrc/includes/cpu_adam.h:16:10: fatal error: cuda_fp16.h: No such file or directory
#include <cuda_fp16.h>
compilation terminated. ninja: build stopped: subcommand failed. Traceback (most recent call last): File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build subprocess.run( File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/subprocess.py", line 528, in run raise CalledProcessError(retcode, process.args, subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/main.py", line 620, in <module>
main()
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/main.py", line 599, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/trainer.py", line 1648, in train
return inner_training_loop(
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/trainer.py", line 1717, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 309, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1174, in _configure_optimizer
basic_optimizer = self._configure_basic_optimizer(model_parameters)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1230, in _configure_basic_optimizer
optimizer = DeepSpeedCPUAdam(model_parameters,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 94, in __init__
self.ds_opt_adam = CPUAdamBuilder().load()
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 454, in load
return self.jit_load(verbose)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 497, in jit_load
op_module = load(name=self.name,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
_write_ninja_file_and_build_library(
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1623, in _write_ninja_file_and_build_library
_run_ninja_build(
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1916, in _run_ninja_build
raise RuntimeError(message) from e
RuntimeError: Error building extension 'cpu_adam'
Loading extension module cpu_adam...
Traceback (most recent call last):
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/main.py", line 620, in <module>
main()
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/main.py", line 599, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/trainer.py", line 1648, in train
return inner_training_loop(
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/trainer.py", line 1717, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/transformers/src/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 309, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1174, in _configure_optimizer
basic_optimizer = self._configure_basic_optimizer(model_parameters)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1230, in _configure_basic_optimizer
optimizer = DeepSpeedCPUAdam(model_parameters,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 94, in __init__
self.ds_opt_adam = CPUAdamBuilder().load()
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 454, in load
return self.jit_load(verbose)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/op_builder/builder.py", line 497, in jit_load
op_module = load(name=self.name,
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1284, in load
return _jit_compile(
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1534, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/utils/cpp_extension.py", line 1936, in _import_module_from_library
module = importlib.util.module_from_spec(spec)
File "<frozen importlib._bootstrap>", line 565, in module_from_spec
File "<frozen importlib._bootstrap_external>", line 1173, in create_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
ImportError: /mnt/cache/weishao4/.cache/torch_extensions/py39_cu117/cpu_adam/cpu_adam.so: cannot open shared object file: No such file or directory
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7faf8116e8b0>
Traceback (most recent call last):
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 102, in __del__
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
Exception ignored in: <function DeepSpeedCPUAdam.__del__ at 0x7fc8dcf6e8b0>
Traceback (most recent call last):
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/deepspeed/ops/adam/cpu_adam.py", line 102, in __del__
self.ds_opt_adam.destroy_adam(self.opt_id)
AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 848830) of binary: /mnt/cache/weishao4/anaconda3/envs/toxicity/bin/python
Traceback (most recent call last):
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/mnt/cache/weishao4/anaconda3/envs/toxicity/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/mnt/cache/weishao4/Projects/Toxicity/LLM_fine_tune/ToxDetLLaMa/main.py FAILED
Failures: [1]: time : 2023-09-27_14:00:16 host : xgcsdx-SYS-740GP-TNRT rank : 1 (local_rank: 1) exitcode : 1 (pid: 848831) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2023-09-27_14:00:16 host : xgcsdx-SYS-740GP-TNRT rank : 0 (local_rank: 0) exitcode : 1 (pid: 848830) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Environment: A6000 80G, PyTorch 1.13.1, Python 3.9, CUDA 11.7, DeepSpeed 0.9.3