Closed - CurtiusSimplus closed this issue 2 weeks ago
Hey! Sorry about the issue - can you copy paste the

    TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, ...

part of your code? Thanks!
Yes, no worries:

    from trl import SFTTrainer
    from transformers import TrainingArguments
    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 8,
            gradient_checkpointing = True,
            warmup_steps = 100,
            max_steps = 1000, # Overrides num_train_epochs when set
            learning_rate = 2e-4,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear", # cosine is another option
            seed = 3407,
            output_dir = "outputs",
            report_to = "none", # Use this for WandB etc
            save_strategy = "steps",
            save_steps = 10,
            save_total_limit = 1,
            push_to_hub = False,
        ),
    )
The model is the 4-bit Nemo you provide as an example, so it is not an exotic architecture.
Oh yep can reproduce the issue - sorry about that!
Just fixed it! Please update Unsloth without changing dependencies if you're on a local machine - on Colab / Kaggle, just delete the runtime and start again:
    pip uninstall unsloth unsloth-zoo -y && pip install --upgrade --no-cache-dir --no-deps unsloth unsloth-zoo
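After reinstalling, it can help to confirm the upgrade actually took effect. This is just a sanity-check sketch I'd suggest (not part of the fix itself); it prints whichever versions pip ended up installing:

    from importlib.metadata import version, PackageNotFoundError

    # Print the installed versions of the relevant packages so we can
    # confirm the upgrade and spot any stale cached wheels.
    for pkg in ("unsloth", "unsloth-zoo", "torch", "transformers"):
        try:
            print(pkg, version(pkg))
        except PackageNotFoundError:
            print(pkg, "not installed")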
So IF on Colab ... just restart? Same code, or use the 'unsloth-zoo' install command?
Oh, disconnect and delete the runtime, and yes, restart.
Seems to be working again ... you got right on it. :)
Here is the complete error ...

    ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
    Num examples = 21,399 | Num Epochs = 2
    Batch size per device = 4 | Gradient Accumulation steps = 8
    Total batch size = 32 | Total steps = 1,000
    Number of trainable parameters = 114,032,640
    RuntimeError                              Traceback (most recent call last)
    in <cell line: 1>()
    ----> 1 trainer_stats = trainer.train()
          2 #trainer_stats = trainer.train(resume_from_checkpoint = True)
          3 #from unsloth import unsloth_train
          4 #trainer_stats = unsloth_train(trainer)

    14 frames
    /usr/local/lib/python3.10/dist-packages/unsloth/tokenizer_utils.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

    /usr/local/lib/python3.10/dist-packages/unsloth/models/llama.py in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)

    /usr/local/lib/python3.10/dist-packages/unsloth/models/_utils.py in _unsloth_training_step(failed resolving arguments)

    /usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py in backward(self, loss, **kwargs)
       2194             self.lomo_backward(loss, learning_rate)
       2195         else:
    -> 2196             loss.backward(**kwargs)
       2197
       2198     def set_trigger(self):

    /usr/local/lib/python3.10/dist-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
        579             inputs=inputs,
        580         )
    --> 581         torch.autograd.backward(
        582             self, gradient, retain_graph, create_graph, inputs=inputs
        583         )

    /usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
        345     # some Python versions print out the first line of a multi-line function
        346     # calls in the traceback and some print out the last line
    --> 347     _engine_run_backward(
        348         tensors,
        349         grad_tensors,

    /usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
        823         unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
        824     try:
    --> 825         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
        826             t_outputs, *args, **kwargs
        827         )  # Calls into the C++ engine to run the backward pass

    /usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(self, *args)
        305             )
        306         user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
    --> 307         return user_fn(self, *args)
        308
        309     def apply_jvp(self, *args):

    /usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py in decorate_bwd(*args, **kwargs)
        509             dtype=args[0]._dtype,
        510         ):
    --> 511             return bwd(*args, **kwargs)
        512
        513     return decorate_bwd

    /usr/local/lib/python3.10/dist-packages/unsloth_zoo/gradient_checkpointing.py in backward(ctx, dY)
        156         with torch.enable_grad():
        157             (output,) = ctx.forward_function(hidden_states, *ctx.args)
    --> 158         torch.autograd.backward(output, dY)
        159         return (None, hidden_states.grad,) + (None,)*len(ctx.args)
        160     pass

    /usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
        345     # some Python versions print out the first line of a multi-line function
        346     # calls in the traceback and some print out the last line
    --> 347     _engine_run_backward(
        348         tensors,
        349         grad_tensors,

    /usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
        823         unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
        824     try:
    --> 825         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
        826             t_outputs, *args, **kwargs
        827         )  # Calls into the C++ engine to run the backward pass

    /usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(self, *args)
        305             )
        306         user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
    --> 307         return user_fn(self, *args)
        308
        309     def apply_jvp(self, *args):

    /usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py in decorate_bwd(*args, **kwargs)
        509             dtype=args[0]._dtype,
        510         ):
    --> 511             return bwd(*args, **kwargs)
        512
        513     return decorate_bwd

    /usr/local/lib/python3.10/dist-packages/unsloth/kernels/fast_lora.py in backward(ctx, dY)
        134         # dX += matmul_lora(de, gateW.t(), gateW_quant, gateB, gateA, gateS)
        135         upW = fast_dequantize(upW.t(), upW_quant)
    --> 136         dX = torch.matmul(df, upW.t(), out = X if ctx.inplace else None)
        137         del upW
        138         dX += df @ upB.to(dtype).t() @ (upS * upA.to(dtype).t())

    RuntimeError: Expected out tensor to have dtype c10::BFloat16, but got float instead
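For what it's worth, the message itself just means torch.matmul was handed an out= buffer whose dtype doesn't match the dtype of the computation (here the LoRA backward reuses X as the output buffer). A tiny standalone sketch, with hypothetical tensors and nothing Unsloth-specific, triggers the same kind of error:

    import torch

    # The matmul computes in bfloat16, but the preallocated out= buffer is
    # float32, so torch raises a dtype-mismatch RuntimeError like the one above.
    a = torch.randn(4, 8, dtype=torch.bfloat16)
    b = torch.randn(8, 4, dtype=torch.bfloat16)
    out = torch.empty(4, 4, dtype=torch.float32)  # wrong dtype for the out buffer

    try:
        torch.matmul(a, b, out=out)
    except RuntimeError as e:
        print(e)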
It has been doing this since sometime last night.
Unsloth and transformers are installed as such:
    %%capture
    !pip install unsloth "xformers==0.0.28.post2"
    # Also get the latest nightly Unsloth!
    !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"