Open · ianferreira opened this issue 2 years ago
Hi @ianferreira,
Could you please try with the latest version of ROCm and see if the issue persists? If it does, could you please share the model you are training? Also, if you disable fp16, does the model train successfully? Thanks!
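For reference, a minimal sketch of turning mixed precision off, assuming it was enabled through `TrainingArguments` (the `fp16` flag below is the standard one; adjust to however it is set in your script):

```python
from transformers import TrainingArguments

# Hypothetical minimal arguments: keep all other settings the same and only
# switch mixed precision off to see whether the StopIteration goes away.
training_args = TrainingArguments(
    output_dir="out",  # placeholder path
    fp16=False,        # train in full (fp32) precision
)
```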
🐛 Describe the bug
Training GPT using two RX6800 (Navi 21)
```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=model_path,          # output directory where model checkpoints are saved
    evaluation_strategy="steps",    # evaluate every `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=4,  # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulate gradients before updating the weights
    per_device_eval_batch_size=2,   # evaluation batch size
    logging_steps=100,              # evaluate, log and save model checkpoints every `logging_steps` steps
    save_steps=1000,
    eval_steps=1000,
    load_best_model_at_end=True,    # whether to load the best model (in terms of loss) at the end of training
)
```
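Note that with both GPUs visible, the Trainer sees `n_gpu == 2` and wraps the model in `torch.nn.DataParallel`, which is why the traceback below runs through `data_parallel.py`. As a quick check of whether that replication step is the trigger, one can expose a single device before torch is imported; a minimal sketch (`HIP_VISIBLE_DEVICES` is the ROCm analogue of `CUDA_VISIBLE_DEVICES`):

```python
import os

# Hypothetical check: expose only one GPU so the Trainer sees a single device
# and skips the torch.nn.DataParallel wrapping shown in the traceback below.
# This must run before torch initializes the GPUs.
os.environ["HIP_VISIBLE_DEVICES"] = "0"

import torch  # imported only after the device list is restricted
```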
Calling trainer.train() then yields this error with the Hugging Face OpenAI GPT model:
```
***** Running training *****
  Num examples = 637416
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 99590

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
Cell In [198], line 1
----> 1 trainer.train()

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:1500, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1495     self.model_wrapped = self.model
   1497 inner_training_loop = find_executable_batch_size(
   1498     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1499 )
-> 1500 return inner_training_loop(
   1501     args=args,
   1502     resume_from_checkpoint=resume_from_checkpoint,
   1503     trial=trial,
   1504     ignore_keys_for_eval=ignore_keys_for_eval,
   1505 )

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:1742, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1740         tr_loss_step = self.training_step(model, inputs)
   1741 else:
-> 1742     tr_loss_step = self.training_step(model, inputs)
   1744 if (
   1745     args.logging_nan_inf_filter
   1746     and not is_torch_tpu_available()
   1747     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1748 ):
   1749     # if loss is nan or inf simply add the average of previous logged losses
   1750     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:2486, in Trainer.training_step(self, model, inputs)
   2483     return loss_mb.reduce_mean().detach().to(self.args.device)
   2485 with self.compute_loss_context_manager():
-> 2486     loss = self.compute_loss(model, inputs)
   2488 if self.args.n_gpu > 1:
   2489     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File ~/.venvs/pt/lib/python3.8/site-packages/transformers/trainer.py:2518, in Trainer.compute_loss(self, model, inputs, return_outputs)
   2516 else:
   2517     labels = None
-> 2518 outputs = model(**inputs)
   2519 # Save past state if it exists
   2520 # TODO: this needs to be fixed and made cleaner later.
   2521 if self.args.past_index >= 0:

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:168, in DataParallel.forward(self, *inputs, **kwargs)
    166     return self.module(*inputs[0], **kwargs[0])
    167 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 168 outputs = self.parallel_apply(replicas, inputs, kwargs)
    169 return self.gather(outputs, self.output_device)

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:178, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
    177 def parallel_apply(self, replicas, inputs, kwargs):
--> 178     return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File ~/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py:86, in parallel_apply(modules, inputs, kwargs_tup, devices)
     84     output = results[i]
     85     if isinstance(output, ExceptionWrapper):
---> 86         output.reraise()
     87     outputs.append(output)
     88 return outputs

File ~/.venvs/pt/lib/python3.8/site-packages/torch/_utils.py:461, in ExceptionWrapper.reraise(self)
    457     except TypeError:
    458         # If the exception takes multiple arguments, don't try to
    459         # instantiate since we don't know how to
    460         raise RuntimeError(msg) from None
--> 461 raise exception

StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ian/.venvs/pt/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/ian/.venvs/pt/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/ian/.venvs/pt/lib/python3.8/site-packages/transformers/models/openai/modeling_openai.py", line 481, in forward
    attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
StopIteration
```
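The re-raised exception comes from modeling_openai.py, which looks up a parameter dtype with `next(self.parameters())` for the attention-mask cast. Inside a DataParallel replica that iterator can come up empty in some PyTorch versions (replicated parameters end up as plain attributes rather than registered parameters), and `next()` on an empty iterator raises StopIteration. A minimal sketch of the same pattern, using a hypothetical module rather than the actual model:

```python
import torch
import torch.nn as nn

class MaskCast(nn.Module):
    """Hypothetical stand-in for the failing pattern in modeling_openai.py."""
    def forward(self, mask):
        # next() on an empty parameter iterator raises StopIteration, which is
        # what DataParallel re-raises from replica 0 in the traceback above.
        return mask.to(dtype=next(self.parameters()).dtype)

MaskCast()(torch.ones(2, 2))  # raises StopIteration: the module has no registered parameters
```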
Versions
```
Collecting environment information...
PyTorch version: 1.12.1+rocm5.1.1
Is debug build: False
CUDA used to build PyTorch: N/A
ROCM used to build PyTorch: 5.1.20531-cacfa990

OS: Ubuntu 22.04.1 LTS (x86_64)
GCC version: (Ubuntu 11.2.0-19ubuntu1) 11.2.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: glibc-2.35

Python version: 3.8.15 (default, Oct 12 2022, 19:15:16) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.15.0-52-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to:
GPU models and configuration: AMD Radeon RX 6800 XT
Nvidia driver version: Could not collect
cuDNN version: Could not collect
HIP runtime version: 5.3.22061
MIOpen runtime version: 2.16.0
Is XNNPACK available: True

Versions of relevant libraries:
[pip3] numpy==1.23.4
[pip3] torch==1.12.1+rocm5.1.1
[pip3] torchaudio==0.12.1+rocm5.1.1
[pip3] torchvision==0.13.1+rocm5.1.1
[conda] Could not collect
```