unography opened this issue 1 month ago

+1, similar error on optimum-neuron version 0.0.25 and Neuron SDK 2.20:
Traceback (most recent call last):
  File "/home/ubuntu/ml-specialized-hardware/purpose-built-accelerators/notebooks/src/train.py", line 101, in <module>
    trainer.train()
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/trainers.py", line 1456, in train
    result = super().train(
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in train
    return inner_training_loop(
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/utils/require_utils.py", line 51, in wrapper
    return func(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/trainers.py", line 1096, in _inner_training_loop
    self.optimizer.step()
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/utils/require_utils.py", line 51, in wrapper
    return func(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/optimum/neuron/accelerate/optimizer.py", line 104, in step
    self.optimizer.step(closure)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
    out = func(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch_xla/distributed/zero_redundancy_optimizer.py", line 336, in step
    self._clip_grad_norm(max_norm=self.max_norm)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/neuronx_distributed/optimizer/zero_redundancy_optimizer.py", line 98, in _clip_grad_norm
    all_parameters, self._grad_norm = self._get_params_and_grad_norm(norm_type)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/neuronx_distributed/optimizer/zero_redundancy_optimizer.py", line 83, in _get_params_and_grad_norm
    grad_norm = get_grad_norm(
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/neuronx_distributed/parallel_layers/grads.py", line 116, in get_grad_norm
    device = parameters[0].device
IndexError: list index out of range
98%|█████████▊| 1000/1022 [02:22<00:03, 7.01it/s]
[2024-10-15 14:16:00,812] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 7613) of binary: /opt/aws_neuronx_venv_pytorch_2_1/bin/python3
Traceback (most recent call last):
  File "/opt/aws_neuronx_venv_pytorch_2_1/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/aws_neuronx_venv_pytorch_2_1/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
System Info
Who can help?
@michaelbenayoun
Information

Tasks
An officially supported task in the examples folder (such as GLUE/SQuAD, ...)

Reproduction (minimal, reproducible, runnable)
Pretraining a TinyLlama-like model on the wikitext dataset, using the same tokenizer as TinyLlama.
Create a Trainium instance following the steps here.
Get the official training script:
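The script link isn't shown here; judging from the paths in the traceback, it is src/train.py from the aws-samples ml-specialized-hardware repository, so a plausible fetch (repository URL assumed, not taken from the issue) is:

# Assumed URL, inferred from the traceback path
# /home/ubuntu/ml-specialized-hardware/purpose-built-accelerators/notebooks/src/train.py
git clone https://github.com/aws-samples/ml-specialized-hardware.git
cd ml-specialized-hardware/purpose-built-accelerators/notebooks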
Export the BF16 environment variable:
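The exact variable isn't shown here; on torch-xla/Neuron the usual switch is XLA_USE_BF16:

# Standard torch-xla flag: run float32 operations in bfloat16 on XLA devices
export XLA_USE_BF16=1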
Compile:
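The command isn't shown here either; a minimal sketch, assuming the standard neuron_parallel_compile ahead-of-time wrapper around the same torchrun launch used for training:

# Hypothetical invocation: the core count and the script's own flags are
# placeholders, not taken from the issue.
neuron_parallel_compile torchrun --nproc_per_node=2 src/train.py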
Train:
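Likewise a sketch, presumably the same launch without the compile wrapper:

# Hypothetical invocation; how ZeRO-1 gets enabled (e.g. a zero_1 training
# argument) is not shown here.
torchrun --nproc_per_node=2 src/train.py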
Error with stack trace: as in the traceback at the top of this page.
There is no error if zero1 isn't used; training runs as expected. (From the traceback, neuronx_distributed's get_grad_norm fails because the parameter list it receives is empty, which would explain why the error only appears with the ZeRO-1 sharded optimizer; the run dies at step 1000 of 1022, apparently right where the checkpoint is saved.)

Expected behavior
Model resumes training correctly after saving the checkpoint