Closed: brewswang closed this issue 4 months ago.
Command line: torchrun --nnodes 1 --nproc_per_node 2 llama_finetuning.py --enable_fsdp --use_peft --peft_method lora --dataset alpaca_dataset --model_name /gpt/models/Llama-2-13b-chat-hf --pure_bf16 --output_dir outputs
This seems to be solved; closing. Please feel free to re-open if you are still seeing the issue.
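For anyone landing here later: one frequently reported trigger for this RuntimeError in PEFT setups is gradient checkpointing combined with frozen base weights, and calling enable_input_require_grads() before training is the usual workaround. A minimal sketch under that assumption (the checkpoint path is the one from the command above; this is not necessarily the fix that resolved this particular issue):

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Hypothetical setup: substitute your own checkpoint path.
model = AutoModelForCausalLM.from_pretrained(
    "/gpt/models/Llama-2-13b-chat-hf", torch_dtype=torch.bfloat16
)

# With the base weights frozen by PEFT, gradient checkpointing can detach the
# autograd graph unless the embedding output is made to require grad.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# Apply LoRA only after the hooks above are in place.
model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM"))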
System Info
PyTorch: 2.0.1, CUDA: 11.7, GPUs: 2x A100
Information
🐛 Describe the bug
I am fine-tuning Llama 2 13B and get the error: element 0 of tensors does not require grad and does not have a grad_fn
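For context, PyTorch raises this error whenever .backward() is called on a tensor that is detached from autograd, which is what happens when every parameter feeding the loss is frozen. A minimal illustration:

import torch

x = torch.ones(3)   # requires_grad is False by default
loss = x.sum()      # so the loss carries no grad_fn
loss.backward()     # RuntimeError: element 0 of tensors does not require grad ...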
Error logs
Traceback (most recent call last):
File "/gpt/code/python/llama-recipes/llama_finetuning.py", line 237, in <module>
fire.Fire(main)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/gpt/code/python/llama-recipes/llama_finetuning.py", line 220, in main
results = train(
File "/gpt/code/python/llama-recipes/utils/train_utils.py", line 100, in train
loss.backward()
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/autograd/init.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
Traceback (most recent call last):
File "/gpt/code/python/llama-recipes/llama_finetuning.py", line 237, in
fire.Fire(main)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/gpt/code/python/llama-recipes/llama_finetuning.py", line 220, in main
results = train(
File "/gpt/code/python/llama-recipes/utils/train_utils.py", line 100, in train
loss.backward()
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/autograd/init.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 15578) of binary: /root/anaconda3/envs/recipes/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/recipes/bin/torchrun", line 8, in
sys.exit(main())
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/recipes/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
llama_finetuning.py FAILED
Expected behavior
Fine-tuning should run to completion without the RuntimeError above.
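As a quick sanity check before training starts, it may help to verify that the PEFT-wrapped model still has trainable parameters, since an all-frozen model produces exactly this backward() failure. A hypothetical helper (the function name is illustrative, not part of llama-recipes):

import torch.nn as nn

def assert_trainable(model: nn.Module) -> None:
    """Fail fast if every parameter is frozen, which is what produces
    'element 0 of tensors does not require grad'."""
    trainable = [n for n, p in model.named_parameters() if p.requires_grad]
    print(f"{len(trainable)} parameter tensors require grad")
    assert trainable, "no trainable parameters -- loss.backward() will fail"

Calling assert_trainable(model) right after get_peft_model(...) and again after FSDP wrapping should narrow down which step drops the gradients.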