Open qxde01 opened 1 month ago
Well, I haven't tried ZeRO-3. Check https://stackoverflow.com/questions/75517324/runtimeerror-inference-tensors-cannot-be-saved-for-backward-to-work-around-you, and try changing @torch.inference_mode to @torch.no_grad in executor.py. Please tell me if it works.
谢谢,我这里不起作用,我修改的地方是: llm.py
# @torch.inference_mode()
def inference( ... ) -> torch.Tensor:
with torch.no_grad():
device = text.device
......
仍然报错:
[2024-07-22 11:35:40,269] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint epoch_0_whole is ready now!
[2024-07-22 11:35:40,269] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint epoch_0_whole is ready now!
Traceback (most recent call last):
File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 136, in <module>
main()
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 132, in main
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
File "/home/gpu/CosyVoice/cosyvoice/utils/executor.py", line 67, in train_one_epoc
info_dict = batch_forward(model, batch_dict, info_dict)
File "/home/gpu/CosyVoice/cosyvoice/utils/train_utils.py", line 212, in batch_forward
info_dict['loss_dict'] = model(batch, device)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1846, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 108, in forward
text_token, text_token_len = self.encode(text_token, text_token_len)
File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 71, in encode
encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/transformer/encoder.py", line 145, in forward
xs, pos_emb, masks = self.embed(xs, masks)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/transformer/subsampling.py", line 111, in forward
x = self.out(x)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/normalization.py", line 190, in forward
return F.layer_norm(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 2515, in layer_norm
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: Inference tensors cannot be saved for backward. To work around you can make a clone to get a normal tensor and use it in autograd.
Traceback (most recent call last):
File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 136, in <module>
main()
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 132, in main
executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
File "/home/gpu/CosyVoice/cosyvoice/utils/executor.py", line 67, in train_one_epoc
info_dict = batch_forward(model, batch_dict, info_dict)
File "/home/gpu/CosyVoice/cosyvoice/utils/train_utils.py", line 212, in batch_forward
info_dict['loss_dict'] = model(batch, device)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1846, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 108, in forward
text_token, text_token_len = self.encode(text_token, text_token_len)
File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 71, in encode
encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/transformer/encoder.py", line 145, in forward
xs, pos_emb, masks = self.embed(xs, masks)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/home/gpu/CosyVoice/cosyvoice/transformer/subsampling.py", line 111, in forward
x = self.out(x)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py", line 217, in forward
input = module(input)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl
result = forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/normalization.py", line 190, in forward
return F.layer_norm(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 2515, in layer_norm
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: Inference tensors cannot be saved for backward. To work around you can make a clone to get a normal tensor and use it in autograd.
我的训练参数是:
pretrained_model_dir=../../../pretrained_models/CosyVoice-300M-Instruct
export CUDA_VISIBLE_DEVICES="0,1,2,3"
job_id=1986
#nccl
dist_backend=nccl
num_workers=2
prefetch=100
train_engine=deepspeed
#torch_ddp
torchrun --nnodes=1 --nproc_per_node=4 --master_port=9901 cosyvoice/bin/train.py \
--train_engine $train_engine \
--config conf/cosyvoice.yaml \
--train_data data/train2.data.list \
--cv_data data/dev.data.list \
--model llm \
--checkpoint $pretrained_model_dir/llm.pt \
--model_dir `pwd`/exp/llm/$train_engine \
--tensorboard_dir `pwd`/tensorboard/llm/$train_engine \
--ddp.dist_backend $dist_backend \
--num_workers ${num_workers} \
--prefetch ${prefetch} \
--pin_memory \
--timeout 600 \
--deepspeed_config ./conf/ds_stage3.json \
--deepspeed.save_states model_only
谢谢,我这里不起作用,我修改的地方是: llm.py
# @torch.inference_mode() def inference( ... ) -> torch.Tensor: with torch.no_grad(): device = text.device ......
仍然报错:
[2024-07-22 11:35:40,269] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint epoch_0_whole is ready now! [2024-07-22 11:35:40,269] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint epoch_0_whole is ready now! Traceback (most recent call last): File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 136, in <module> main() File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper return f(*args, **kwargs) File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 132, in main executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) File "/home/gpu/CosyVoice/cosyvoice/utils/executor.py", line 67, in train_one_epoc info_dict = batch_forward(model, batch_dict, info_dict) File "/home/gpu/CosyVoice/cosyvoice/utils/train_utils.py", line 212, in batch_forward info_dict['loss_dict'] = model(batch, device) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1846, in forward loss = self.module(*inputs, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 108, in forward text_token, text_token_len = self.encode(text_token, text_token_len) File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 71, in encode encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = 
forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/transformer/encoder.py", line 145, in forward xs, pos_emb, masks = self.embed(xs, masks) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/transformer/subsampling.py", line 111, in forward x = self.out(x) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py", line 217, in forward input = module(input) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/normalization.py", line 190, in forward return F.layer_norm( File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 2515, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: Inference tensors cannot be saved for backward. To work around you can make a clone to get a normal tensor and use it in autograd. 
Traceback (most recent call last): File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 136, in <module> main() File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper return f(*args, **kwargs) File "/home/gpu/CosyVoice/examples/libritts/cosyvoice/cosyvoice/bin/train.py", line 132, in main executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join) File "/home/gpu/CosyVoice/cosyvoice/utils/executor.py", line 67, in train_one_epoc info_dict = batch_forward(model, batch_dict, info_dict) File "/home/gpu/CosyVoice/cosyvoice/utils/train_utils.py", line 212, in batch_forward info_dict['loss_dict'] = model(batch, device) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1846, in forward loss = self.module(*inputs, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 108, in forward text_token, text_token_len = self.encode(text_token, text_token_len) File "/home/gpu/CosyVoice/cosyvoice/llm/llm.py", line 71, in encode encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/transformer/encoder.py", line 145, in forward xs, pos_emb, masks = self.embed(xs, masks) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, 
in _call_impl result = forward_call(*args, **kwargs) File "/home/gpu/CosyVoice/cosyvoice/transformer/subsampling.py", line 111, in forward x = self.out(x) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py", line 217, in forward input = module(input) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1538, in _call_impl result = forward_call(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/normalization.py", line 190, in forward return F.layer_norm( File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 2515, in layer_norm return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled) RuntimeError: Inference tensors cannot be saved for backward. To work around you can make a clone to get a normal tensor and use it in autograd.
我的训练参数是:
pretrained_model_dir=../../../pretrained_models/CosyVoice-300M-Instruct export CUDA_VISIBLE_DEVICES="0,1,2,3" job_id=1986 #nccl dist_backend=nccl num_workers=2 prefetch=100 train_engine=deepspeed #torch_ddp torchrun --nnodes=1 --nproc_per_node=4 --master_port=9901 cosyvoice/bin/train.py \ --train_engine $train_engine \ --config conf/cosyvoice.yaml \ --train_data data/train2.data.list \ --cv_data data/dev.data.list \ --model llm \ --checkpoint $pretrained_model_dir/llm.pt \ --model_dir `pwd`/exp/llm/$train_engine \ --tensorboard_dir `pwd`/tensorboard/llm/$train_engine \ --ddp.dist_backend $dist_backend \ --num_workers ${num_workers} \ --prefetch ${prefetch} \ --pin_memory \ --timeout 600 \ --deepspeed_config ./conf/ds_stage3.json \ --deepspeed.save_states model_only
Not in llm.py — try changing https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/utils/executor.py#L82 to torch.no_grad() instead.
Thank you. It works after changing executor.py.
Describe the bug 使用deepspeed zero3微调时,只需11G显存,我用的是4X1080Ti,CUDA 12.1 ,torch 2.0.1 但是每当epoch结束保存模型时,总会出现下面的错误:
zero3配置
请问如何解决这个问题,谢谢。