unslothai / unsloth

Finetune Llama 3.1, Mistral, Phi & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

trainer.train() AttributeError: 'NoneType' object has no attribute 'absmax' #863

Open yangwendy opened 1 month ago

yangwendy commented 1 month ago

```python
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = my_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
```

```
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
```

```
AttributeError                            Traceback (most recent call last)
Cell In[21], line 1
----> 1 trainer.train()

File :126, in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

File :363, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/transformers/trainer.py:3318, in Trainer.training_step(self, model, inputs)
-> 3318     loss = self.compute_loss(model, inputs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/transformers/trainer.py:3363, in Trainer.compute_loss(self, model, inputs, return_outputs)
-> 3363     outputs = model(**inputs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1532     return self._call_impl(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
-> 1541     return forward_call(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/utils/operations.py:819, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
--> 819     return model_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/utils/operations.py:807, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
--> 807     return convert_to_fp32(self.model_forward(*args, **kwargs))

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
---> 16     return func(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/utils/operations.py:819, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
--> 819     return model_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/utils/operations.py:807, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
--> 807     return convert_to_fp32(self.model_forward(*args, **kwargs))

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
---> 16     return func(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/llama.py:964, in PeftModelForCausalLM_fast_forward(self, input_ids, causal_mask, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
--> 964     return self.base_model(
    965         input_ids=input_ids,
    966         causal_mask=causal_mask,
    967         attention_mask=attention_mask,
    968         inputs_embeds=inputs_embeds,
    969         labels=labels,
    970         output_attentions=output_attentions,
    971         output_hidden_states=output_hidden_states,
    972         return_dict=return_dict,
    973         **kwargs,
    974     )

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1532     return self._call_impl(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
-> 1541     return forward_call(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/peft/tuners/tuners_utils.py:161, in BaseTuner.forward(self, *args, **kwargs)
--> 161     return self.model.forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
--> 169     output = module._old_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/llama.py:883, in CausalLM_fast_forward.<locals>._CausalLM_fast_forward(self, input_ids, causal_mask, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, *args, **kwargs)
--> 883     outputs = self.model(
    884         input_ids=input_ids,
    885         causal_mask=causal_mask,
    886         attention_mask=attention_mask,
    887         position_ids=position_ids,
    888         past_key_values=past_key_values,
    889         inputs_embeds=inputs_embeds,
    890         use_cache=use_cache,
    891         output_attentions=output_attentions,
    892         output_hidden_states=output_hidden_states,
    893         return_dict=return_dict,
    894     )

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1532     return self._call_impl(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
-> 1541     return forward_call(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
--> 169     output = module._old_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/llama.py:720, in LlamaModel_fast_forward(self, input_ids, causal_mask, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, *args, **kwargs)
    719     if offloaded_gradient_checkpointing:
--> 720         hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
    721             decoder_layer,
    722             hidden_states,
    723             mask,
    724             attention_mask,
    725             position_ids,
    726             past_key_values,
    727             output_attentions,
    728             use_cache,
    729         )[0]

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/autograd/function.py:598, in Function.apply(cls, *args, **kwargs)
--> 598     return super().apply(*args, **kwargs)  # type: ignore[misc]

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/cuda/amp/autocast_mode.py:115, in custom_fwd.<locals>.decorate_fwd(*args, **kwargs)
--> 115     return fwd(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/_utils.py:659, in Unsloth_Offloaded_Gradient_Checkpointer.forward(ctx, forward_function, hidden_states, *args)
    657     saved_hidden_states = hidden_states.to("cpu", non_blocking = True)
    658     with torch.no_grad():
--> 659         output = forward_function(hidden_states, *args)
    660     ctx.save_for_backward(saved_hidden_states)
    661     ctx.forward_function = forward_function

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1532     return self._call_impl(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
-> 1541     return forward_call(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
--> 169     output = module._old_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/llama.py:467, in LlamaDecoderLayer_fast_forward(self, hidden_states, causal_mask, attention_mask, position_ids, past_key_value, output_attentions, use_cache, padding_mask, *args, **kwargs)
    465     residual = hidden_states
    466     hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
--> 467     hidden_states, self_attn_weights, present_key_value = self.self_attn(
    468         hidden_states=hidden_states,
    469         causal_mask=causal_mask,
    470         attention_mask=attention_mask,
    471         position_ids=position_ids,
    472         past_key_value=past_key_value,
    473         output_attentions=output_attentions,
    474         use_cache=use_cache,
    475         padding_mask=padding_mask,
    476     )
    477     hidden_states = residual + hidden_states

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1532     return self._call_impl(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
-> 1541     return forward_call(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/accelerate/hooks.py:169, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
--> 169     output = module._old_forward(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/models/llama.py:339, in LlamaAttention_fast_forward(self, hidden_states, causal_mask, attention_mask, position_ids, past_key_value, output_attentions, use_cache, padding_mask, *args, **kwargs)
    336     head_dim = self.head_dim
    337     assert(n_kv_heads * n_groups == n_heads)
--> 339     Q, K, V = self.apply_qkv(self, hidden_states)
    340     Q = Q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
    341     K = K.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/kernels/fast_lora.py:320, in apply_lora_qkv(self, X)
    318     KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
    319     VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
--> 320     Q, K, V = LoRA_QKV.apply(X,
    321         QW, QW_quant, QA, QB, QS,
    322         KW, KW_quant, KA, KB, KS,
    323         VW, VW_quant, VA, VB, VS,
    324     )
    325     return Q, K, V

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/autograd/function.py:598, in Function.apply(cls, *args, **kwargs)
--> 598     return super().apply(*args, **kwargs)  # type: ignore[misc]

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/torch/cuda/amp/autocast_mode.py:115, in custom_fwd.<locals>.decorate_fwd(*args, **kwargs)
--> 115     return fwd(*args, **kwargs)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/kernels/fast_lora.py:235, in LoRA_QKV.forward(ctx, X, QW, QW_quant, QA, QB, QS, KW, KW_quant, KA, KB, KS, VW, VW_quant, VA, VB, VS)
    233     dtype = X.dtype
--> 235     Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)
    236     K = matmul_lora(X, KW, KW_quant, KA, KB, KS)
    237     V = matmul_lora(X, VW, VW_quant, VA, VB, VS)

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/kernels/utils.py:269, in matmul_lora(X, W, W_quant, A, B, s, out)
    268     dtype = X.dtype
--> 269     W = fast_dequantize(W.t(), W_quant)
    271     if X.dim() == 3:
    272         batch, seq_len, d = X.shape

File /anaconda/envs/azureml_py38/lib/python3.9/site-packages/unsloth/kernels/utils.py:119, in fast_dequantize(W, quant_state, out)
    117     offset = quant_state.offset
    118     state2 = quant_state.state2
--> 119     absmax2 = state2.absmax
    120     code2 = state2.code
    121     blocksize2 = state2.blocksize

AttributeError: 'NoneType' object has no attribute 'absmax'
```

danielhanchen commented 1 month ago

OK, this is a weird one! Hmm, I'm assuming the quantization is not using double-quantized NF4 but just a single quant?
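
A quick way to confirm is to inspect the quant_state on the 4-bit layers. This is only a rough sketch (it assumes model is the already loaded 4-bit model and a recent bitsandbytes): state2 is populated only when the weights were quantized with double quantization, and fast_dequantize reads state2.absmax, which is exactly where the traceback above dies.

```python
# Rough sketch: check whether the 4-bit layers carry a nested (double) quant state.
# state2 is None when the weights were quantized without bnb_4bit_use_double_quant,
# and Unsloth's fast_dequantize reads state2.absmax, reproducing this exact error.
from bitsandbytes.nn import Linear4bit

def check_double_quant(model):
    for name, module in model.named_modules():
        if isinstance(module, Linear4bit) and getattr(module.weight, "quant_state", None) is not None:
            qs = module.weight.quant_state
            print(name, qs.quant_type, "double_quant =", qs.state2 is not None)

check_double_quant(model)
```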

abedkhooli commented 3 weeks ago

I also see the same error on a model I quantized using bnb (4-bit). I suspect it is related to the model itself, although it is based on Llama-2. Base model: https://huggingface.co/inceptionai/jais-adapted-13b-chat
I get the same error with a quantized model from https://huggingface.co/FreedomIntelligence/AceGPT-v1.5-13B-Chat
Update: I got both to work with NF4, but the output is useless. Not sure if these models are supported.
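
For reference, getting past the absmax error meant re-quantizing the base model roughly as below (a sketch only; the dtype, device_map and output path are illustrative). Enabling bnb_4bit_use_double_quant is what produces the nested quant_state.state2 that Unsloth's fast_dequantize expects.

```python
# Illustrative re-quantization of the base model to NF4 with double quantization.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,   # populates quant_state.state2
    bnb_4bit_compute_dtype = torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "inceptionai/jais-adapted-13b-chat",
    quantization_config = bnb_config,
    device_map = "auto",
)
tokenizer = AutoTokenizer.from_pretrained("inceptionai/jais-adapted-13b-chat")

# Save the 4-bit checkpoint so it can be loaded again later.
model.save_pretrained("jais-adapted-13b-chat-bnb-4bit")
tokenizer.save_pretrained("jais-adapted-13b-chat-bnb-4bit")
```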

danielhanchen commented 2 weeks ago

Hmm, unsure about the bad outputs - did you use FastLanguageModel.for_inference(model)?
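
For reference, the inference flow should look roughly like this (the prompt and generation settings are placeholders):

```python
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)  # switch on Unsloth's faster inference path
inputs = tokenizer(["<your prompt here>"], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
print(tokenizer.batch_decode(outputs, skip_special_tokens = True)[0])
```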

abedkhooli commented 2 weeks ago

@danielhanchen yes. I feel Unsloth's quantization is different from BitsAndBytes, and that could be the cause. I get much better output with Qwen2, although the above models should be better.

danielhanchen commented 2 weeks ago

Oh, we use BnB (bitsandbytes) directly.
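
So passing load_in_4bit = True to FastLanguageModel.from_pretrained quantizes the weights with bitsandbytes at load time (NF4 with double quantization), and no separate pre-quantization step should be needed. A rough sketch, with an illustrative max_seq_length:

```python
from unsloth import FastLanguageModel

# load_in_4bit = True lets bitsandbytes quantize the 16-bit weights at load time,
# so you can point this at the original (unquantized) checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "inceptionai/jais-adapted-13b-chat",  # or any 16-bit base model
    max_seq_length = 2048,
    load_in_4bit = True,
)
```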