Open swyam opened 1 year ago
Same error here, running Python 3.10.7 on Ubuntu 23.04.
I managed to make it work by running `pip install deepspeed==0.8.3` instead of installing the latest deepspeed; see https://github.com/microsoft/DeepSpeed/issues/3309#issuecomment-1515068518
I used `pip install deepspeed==0.8.3`
and got this error:
root@e3677efd571d:/workspace/vall-e# python -m vall_e.train yaml=config/LibriTTS/ar.yml
1222it [00:00, 39223.40it/s]
2023-10-25 18:39:07 - vall_e.data - INFO - GR=0;LR=0 -
{'</s>': 1, '<s>': 2, 'AA0': 3, 'AA1': 4, 'AA2': 5, 'AE1': 6, 'AE2': 7, 'AH0': 8, 'AH1': 9, 'AO1': 10, 'AW1': 11, 'AY1': 12, 'AY2': 13, 'B': 14, 'CH': 15, 'D': 16, 'DH': 17, 'EH0': 18, 'EH1': 19, 'ER0': 20, 'ER1': 21, 'EY1': 22, 'EY2': 23, 'F': 24, 'G': 25, 'HH': 26, 'IH0': 27, 'IH1': 28, 'IY0': 29, 'IY1': 30, 'JH': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'NG': 36, 'OW1': 37, 'OW2': 38, 'P': 39, 'R': 40, 'S': 41, 'SH': 42, 'T': 43, 'TH': 44, 'UH1': 45, 'UW0': 46, 'UW1': 47, 'V': 48, 'W': 49, 'Y': 50, 'Z': 51, '_': 52}
2023-10-25 18:39:07 - vall_e.data - INFO - GR=0;LR=0 -
{'data': 0}
2023-10-25 18:39:07 - vall_e.data - INFO - GR=0;LR=0 -
#samples (train): 41.
2023-10-25 18:39:07 - vall_e.data - INFO - GR=0;LR=0 -
#samples (val): 8.
[2023-10-25 18:39:08,024] [INFO] [comm.py:652:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2023-10-25 18:39:09,114] [INFO] [logging.py:93:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
FAILED: fused_adam_frontend.o
c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
In file included from /opt/conda/lib/python3.10/site-packages/torch/include/torch/extension.h:5,
from /opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp:1:
/opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include/torch/all.h:4:2: error: #error C++17 or later
[2/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/includes -I/opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam -isystem /opt/conda/lib/python3.10/site-packages/torch/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.10/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_86,code=compute_86 -std=c++17 -c /opt/conda/lib/python3.10/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build
subprocess.run(
File "/opt/conda/lib/python3.10/subprocess.py", line 526, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/vall-e/vall_e/train.py", line 128, in <module>
main()
File "/workspace/vall-e/vall_e/train.py", line 119, in main
trainer.train(
File "/workspace/vall-e/vall_e/utils/trainer.py", line 125, in train
engines = engines_loader()
File "/workspace/vall-e/vall_e/train.py", line 21, in load_engines
model=trainer.Engine(
File "/workspace/vall-e/vall_e/utils/engines.py", line 22, in __init__
super().__init__(None, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 340, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1283, in _configure_optimizer
basic_optimizer = self._configure_basic_optimizer(model_parameters)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1360, in _configure_basic_optimizer
optimizer = FusedAdam(
File "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 73, in __init__
fused_adam_cuda = FusedAdamBuilder().load()
File "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py", line 485, in load
return self.jit_load(verbose)
File "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py", line 520, in jit_load
op_module = load(
File "/opt/conda/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1308, in load
return _jit_compile(
File "/opt/conda/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile
_write_ninja_file_and_build_library(
File "/opt/conda/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library
_run_ninja_build(
File "/opt/conda/lib/python3.10/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build
raise RuntimeError(message) from e
RuntimeError: Error building extension 'fused_adam'
pip install deepspeed==0.8.3
pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
I got the following error:
ImportError: /root/.cache/torch_extensions/py310_cu118/fused_adam/fused_adam.so: undefined symbol: _ZN3c104cuda9SetDeviceEi
After running the following two commands:
pip install deepspeed==0.8.3
pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!mkdir -p zoo
!python -m vall_e.export zoo/ar.pt yaml=config/test/ar.yml
!python -m vall_e.export zoo/nar.pt yaml=config/test/nar.yml
Traceback (most recent call last):
File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/content/vall-e/vall_e/export.py", line 25, in <module>
main()
File "/content/vall-e/vall_e/export.py", line 14, in main
engine = load_engines()
File "/content/vall-e/vall_e/train.py", line 21, in load_engines
model=trainer.Engine(
File "/content/vall-e/vall_e/utils/engines.py", line 22, in __init__
super().__init__(None, *args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 244, in __init__
self._do_sanity_check()
File "/usr/local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 968, in _do_sanity_check
if self.optimizer_name() is not None:
File "/usr/local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 628, in optimizer_name
return (self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name)
AttributeError: 'NoneType' object has no attribute 'optimizer_name'
Can anyone help me solve this? I hit it while running the demo on Colab.