I have been trying to use the finetune.py script with my own dataset on a single H100 GPU with CUDA 11.8
I have been getting the following error.
The same script and dataset are working when using a machine with A100 and CUDA 11.7
Any ideas?
Thank you!
cuBLAS API failed with status 15
A: torch.Size([2048, 6144]), B: torch.Size([6400, 6144]), C: (2048, 6400); (lda, ldb, ldc): (c_int(65536), c_int(204800), c_int(65536)); (m, n, k): (c_int(2048), c_int(6400), c_int(6144))
error detected
Traceback (most recent call last):
  File "StarCoder/finetune/finetune_h100.py", line 366, in <module>
    main(args)
  File "StarCoder/finetune/finetune_h100.py", line 353, in main
    run_training(args, train_dataset, eval_dataset)
  File "StarCoder/finetune/finetune_h100.py", line 323, in run_training
    trainer.train()
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 1664, in train
    return inner_training_loop(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 1940, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 2735, in training_step
    loss = self.compute_loss(model, inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 2767, in compute_loss
    outputs = model(**inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/peft/peft_model.py", line 707, in forward
    return self.base_model(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 808, in forward
    transformer_outputs = self.transformer(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 663, in forward
    outputs = torch.utils.checkpoint.checkpoint(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 659, in custom_forward
    return module(*inputs, use_cache, output_attentions)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 316, in forward
    attn_outputs = self.attn(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 230, in forward
    query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/peft/tuners/lora.py", line 751, in forward
    result = super().forward(x)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 388, in forward
    out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 559, in matmul
    return MatMul8bitLt.apply(A, B, out, bias, state)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 397, in forward
    out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/functional.py", line 1781, in igemmlt
    raise Exception('cublasLt ran into an error!')
Exception: cublasLt ran into an error!
Hello,
I have been trying to use the finetune.py script with my own dataset on a single H100 GPU with CUDA 11.8. I have been getting the following error. The same script and dataset are working when using a machine with an A100 and CUDA 11.7. Any ideas?
Thank you!
cuBLAS API failed with status 15
A: torch.Size([2048, 6144]), B: torch.Size([6400, 6144]), C: (2048, 6400); (lda, ldb, ldc): (c_int(65536), c_int(204800), c_int(65536)); (m, n, k): (c_int(2048), c_int(6400), c_int(6144))
error detected
Traceback (most recent call last):
  File "StarCoder/finetune/finetune_h100.py", line 366, in <module>
    main(args)
  File "StarCoder/finetune/finetune_h100.py", line 353, in main
    run_training(args, train_dataset, eval_dataset)
  File "StarCoder/finetune/finetune_h100.py", line 323, in run_training
    trainer.train()
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 1664, in train
    return inner_training_loop(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 1940, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 2735, in training_step
    loss = self.compute_loss(model, inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/trainer.py", line 2767, in compute_loss
    outputs = model(**inputs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/peft/peft_model.py", line 707, in forward
    return self.base_model(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 808, in forward
    transformer_outputs = self.transformer(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 663, in forward
    outputs = torch.utils.checkpoint.checkpoint(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 659, in custom_forward
    return module(*inputs, use_cache, output_attentions)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 316, in forward
    attn_outputs = self.attn(
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py", line 230, in forward
    query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/peft/tuners/lora.py", line 751, in forward
    result = super().forward(x)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 388, in forward
    out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 559, in matmul
    return MatMul8bitLt.apply(A, B, out, bias, state)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 397, in forward
    out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
  File "anaconda3/envs/starcoder_train_11.8/lib/python3.10/site-packages/bitsandbytes/functional.py", line 1781, in igemmlt
    raise Exception('cublasLt ran into an error!')
Exception: cublasLt ran into an error!