Tencent / HunyuanDiT

Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding
https://dit.hunyuan.tencent.com/
Other
2.59k stars 180 forks source link

cuda12难道不能训练吗,非要cuda11? #103

Closed sunhaha123 closed 1 week ago

sunhaha123 commented 1 week ago

Traceback (most recent call last):
  File "hydit/train_deepspeed.py", line 517, in <module>
    main(get_args())
  File "hydit/train_deepspeed.py", line 368, in main
    model, opt, scheduler = deepspeed_initialize(args, logger, model, opt, deepspeed_config)
  File "hydit/train_deepspeed.py", line 47, in deepspeed_initialize
    model, opt, _, scheduler = deepspeed.initialize(
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/__init__.py", line 119, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 294, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1054, in _configure_optimizer
    basic_optimizer = self._configure_basic_optimizer(model_parameters)
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1147, in _configure_basic_optimizer
    optimizer = FusedAdam(
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/ops/adam/fused_adam.py", line 73, in __init__
    fused_adam_cuda = FusedAdamBuilder().load()
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 463, in load
    return self.jit_load(verbose)
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 478, in jit_load
    assert_no_cuda_mismatch()
  File "/home/echo/miniconda3/envs/hunyuan/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 101, in assert_no_cuda_mismatch
    raise Exception(
Exception: Installed CUDA version 12.5 does not match the version torch was compiled with 11.7, unable to compile cuda/cpp extensions without a matching cuda version.

h3clikejava commented 1 week ago

Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/TH -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_89,code=compute_89 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++14 -c /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
FAILED: multi_tensor_adam.cuda.o
/usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/TH -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_89,code=compute_89 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++14 -c /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
nvcc fatal : Unsupported gpu architecture 'compute_89'
[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/TH -isystem /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/timehut/miniconda3/envs/HunyuanDiT/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++17 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -DBF16_AVAILABLE -c /home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1900, in _run_ninja_build
    subprocess.run(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "hydit/train_deepspeed.py", line 517, in <module>
    main(get_args())
  File "hydit/train_deepspeed.py", line 368, in main
    model, opt, scheduler = deepspeed_initialize(args, logger, model, opt, deepspeed_config)
  File "hydit/train_deepspeed.py", line 47, in deepspeed_initialize
    model, opt, _, scheduler = deepspeed.initialize(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/__init__.py", line 181, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 306, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1224, in _configure_optimizer
    basic_optimizer = self._configure_basic_optimizer(model_parameters)
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1301, in _configure_basic_optimizer
    optimizer = FusedAdam(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__
    fused_adam_cuda = FusedAdamBuilder().load()
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 508, in load
    return self.jit_load(verbose)
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 555, in jit_load
    op_module = load(name=self.name,
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1284, in load
    return _jit_compile(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1623, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/home/timehut/miniconda3/envs/HunyuanDiT/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1916, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'fused_adam'
[2024-06-20 17:44:15,202] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2877789

我这是什么错误?