We have an MPT MoD prefix-LM trained with llm-foundry and then exported to HuggingFace (via your scripts).
For some fine-tuning experiments with the HF model, I tried to enable dropout.
However, as soon as I do, the model crashes during training.
Training works fine without the dropout parameter set.
To reproduce
1. Set up dropout in the exported model (is there an easier way? see the sketch below these steps):

    from transformers import AutoConfig, AutoModelForCausalLM

    # Load the exported config and raise attn_pdrop before building the model.
    model_config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    if "attn_dropout" in hyper_conf and "attn_pdrop" in model_config.to_dict().get("attn_config", {}):
        update_dict = model_config.to_dict()["attn_config"]
        update_dict["attn_pdrop"] = hyper_conf["attn_dropout"]
        model_config.update({"attn_config": update_dict})
        print("### Updated attn_dropout to %s ###" % hyper_conf["attn_dropout"])
    model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True)
2. Try to fine-tune the model via the HuggingFace Trainer.
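On the "easier way" question: since model_config.to_dict()["attn_config"] is a plain dict, mutating the loaded config's attribute directly might be enough. A sketch under that assumption, reusing the hypothetical args and hyper_conf from above; note it passes the modified config to from_pretrained, which (unlike from_config) also loads the exported weights:

    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    # Assumes attn_config is exposed as a mutable dict attribute on the exported MPTConfig.
    config.attn_config["attn_pdrop"] = hyper_conf["attn_dropout"]
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config,
                                                 trust_remote_code=True)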
Exception
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /opt/ml/code/entry_point.py:17 in <module> │
│ │
│ 14 │ print("Parent directory: %s" % os.path.abspath("..")) │
│ 15 │ print(os.listdir("..")) │
│ 16 │ │
│ ❱ 17 │ main() │
│ 18 │
│ /opt/ml/code/ds_text2text_sci_bench/train.py:293 in main │
│ 290 │ │ │ val_ds = ds['validation'] │
│ 291 │ │ else: │
│ 292 │ │ │ val_ds = None │
│ ❱ 293 │ │ model = train_causal_model(args, hyper_conf, tokenizer, train_ │
│ 294 │ │ │
│ 295 │ │ model.eval() │
│ 296 │
│ /opt/ml/code/ds_text2text_sci_bench/train.py:380 in train_causal_model │
│ 377 │ ) │
│ 378 │ trainer = Trainer(model=model, tokenizer=tokenizer, args=training_ │
│ 379 │ │ │ │ │ data_collator=data_collator) │
│ ❱ 380 │ trainer.train() │
│ 381 │ │
│ 382 │ trainer.save_model(args.output_dir) │
│ 383 │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1555 in │
│ train │
│ 1552 │ │ │ finally: │
│ 1553 │ │ │ │ hf_hub_utils.enable_progress_bars() │
│ 1554 │ │ else: │
│ ❱ 1555 │ │ │ return inner_training_loop( │
│ 1556 │ │ │ │ args=args, │
│ 1557 │ │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1558 │ │ │ │ trial=trial, │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1860 in │
│ _inner_training_loop │
│ 1857 │ │ │ │ │ self.control = self.callback_handler.on_step_begi │
│ 1858 │ │ │ │ │
│ 1859 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1860 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1861 │ │ │ │ │
│ 1862 │ │ │ │ if ( │
│ 1863 │ │ │ │ │ args.logging_nan_inf_filter │
│ /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2734 in │
│ training_step │
│ 2731 │ │ │ with amp.scale_loss(loss, self.optimizer) as scaled_loss: │
│ 2732 │ │ │ │ scaled_loss.backward() │
│ 2733 │ │ else: │
│ ❱ 2734 │ │ │ self.accelerator.backward(loss) │
│ 2735 │ │ │
│ 2736 │ │ return loss.detach() / self.args.gradient_accumulation_steps │
│ 2737 │
│ /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:1821 in │
│ backward │
│ 1818 │ │ elif self.scaler is not None: │
│ 1819 │ │ │ self.scaler.scale(loss).backward(**kwargs) │
│ 1820 │ │ else: │
│ ❱ 1821 │ │ │ loss.backward(**kwargs) │
│ 1822 │ │
│ 1823 │ def unscale_gradients(self, optimizer=None): │
│ 1824 │ │ """ │
│ /opt/conda/lib/python3.10/site-packages/torch/_tensor.py:487 in backward │
│ 484 │ │ │ │ create_graph=create_graph, │
│ 485 │ │ │ │ inputs=inputs, │
│ 486 │ │ │ ) │
│ ❱ 487 │ │ torch.autograd.backward( │
│ 488 │ │ │ self, gradient, retain_graph, create_graph, inputs=inputs │
│ 489 │ │ ) │
│ 490 │
│ /opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:200 in │
│ 197 │ # The reason we repeat same the comment below is that │
│ 198 │ # some Python versions print out the first line of a multi-line fu │
│ 199 │ # calls in the traceback and some print out the last line │
│ ❱ 200 │ Variable._execution_engine.run_backward( # Calls into the C++ eng │
│ 201 │ │ tensors, grad_tensors_, retain_graph, create_graph, inputs, │
│ 202 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into th │
│ 203 │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: one of the variables needed for gradient computation has been
modified by an inplace operation: [torch.cuda.FloatTensor [8, 16, 512, 512]],
which is output 0 of SoftmaxBackward0, is at version 1; expected version 0
instead. Hint: enable anomaly detection to find the operation that failed to
compute its gradient, with torch.autograd.set_detect_anomaly(True).
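Following that hint, anomaly detection can be switched on before training starts (a minimal sketch; torch.autograd.set_detect_anomaly is the standard PyTorch switch that makes every forward op record a traceback, so a failing backward op can be matched to the forward call that produced it):

    import torch

    # Enable before trainer.train(); this slows training noticeably, so debugging only.
    torch.autograd.set_detect_anomaly(True)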
To narrow it down, here is the output of the same run with anomaly detection enabled:
/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:200: UserWarning: Error detected in SoftmaxBackward0. Traceback of forward call that caused the error:
File "/opt/ml/code/entry_point.py", line 17, in <module>
main()
File "/opt/ml/code/ds_text2text_sci_bench/train.py", line 295, in main
model = train_causal_model(args, hyper_conf, tokenizer, train_ds, val_ds)
File "/opt/ml/code/ds_text2text_sci_bench/train.py", line 384, in train_causal_model
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1555, in train
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1860, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2725, in training_step
loss = self.compute_loss(model, inputs)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2748, in compute_loss
outputs = model(**inputs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py", line 553, in forward
return model_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/utils/operations.py", line 541, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
return func(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/20240201-lr1_0e2-warm-sched-biomed-350M-var_seq1024ctx_15B/modeling_mpt.py", line 317, in forward
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/20240201-lr1_0e2-warm-sched-biomed-350M-var_seq1024ctx_15B/modeling_mpt.py", line 236, in forward
(x, attn_weights, present) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions))
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/20240201-lr1_0e2-warm-sched-biomed-350M-var_seq1024ctx_15B/blocks.py", line 35, in forward
(b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, rotary_emb_w_meta_info=rotary_emb_w_meta_info, attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/20240201-lr1_0e2-warm-sched-biomed-350M-var_seq1024ctx_15B/attention.py", line 300, in forward
(context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, self.kv_n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
File "/root/.cache/huggingface/modules/transformers_modules/20240201-lr1_0e2-warm-sched-biomed-350M-var_seq1024ctx_15B/attention.py", line 96, in scaled_multihead_dot_product_attention
attn_weight = torch.softmax(attn_weight, dim=-1)
(Triggered internally at /opt/conda/conda-bld/pytorch_1679586020379/work/torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/opt/ml/code/entry_point.py", line 17, in <module>
main()
File "/opt/ml/code/ds_text2text_sci_bench/train.py", line 295, in main
model = train_causal_model(args, hyper_conf, tokenizer, train_ds, val_ds)
File "/opt/ml/code/ds_text2text_sci_bench/train.py", line 384, in train_causal_model
trainer.train()
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1555, in train
return inner_training_loop(
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1860, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2734, in training_step
self.accelerator.backward(loss)
File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 1821, in backward
loss.backward(**kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [8, 16, 512, 512]], which is output 0 of SoftmaxBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
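The forward trace points at the torch.softmax call in attention.py:96, and the error says the softmax output was then modified in place. My guess (an assumption, not verified against the exported code) is that the torch attention path applies dropout with inplace=True to the softmax output; that failure mode reproduces standalone:

    import torch
    import torch.nn.functional as F

    # Stand-in for the attention scores; the real tensor is [8, 16, 512, 512].
    scores = torch.randn(2, 4, 16, 16, requires_grad=True)
    attn_weight = torch.softmax(scores, dim=-1)  # autograd saves this output for SoftmaxBackward0
    # In-place dropout bumps the tensor's version counter and invalidates the saved output:
    attn_weight = F.dropout(attn_weight, p=0.1, training=True, inplace=True)
    attn_weight.sum().backward()  # RuntimeError: ... output 0 of SoftmaxBackward0 ...

If that is indeed the cause, patching the exported attention.py to call dropout with inplace=False would be a workaround.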
Expected behavior
Fine-tuning with dropout works.
Versions
Likely exported with llm-foundry v0.4.0; is there a quick way to determine the exporting version after the fact?
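For the environment at hand, the installed package version can at least be read via importlib.metadata (a sketch; this reflects whatever environment it runs in, not necessarily the one that did the export, and assumes the distribution is named llm-foundry):

    from importlib.metadata import PackageNotFoundError, version

    try:
        print(version("llm-foundry"))  # installed llm-foundry version, if any
    except PackageNotFoundError:
        print("llm-foundry is not installed in this environment")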