drummerv opened this issue 4 months ago
Tried it again today on an 8x H100 SXM:
Traceback (most recent call last):
File "/root/miniconda3/envs/py3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py3.10/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 59, in <module>
fire.Fire(do_cli)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 143, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 477, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 693, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 35, in do_cli
return do_train(parsed_cfg, parsed_cli_args)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 53, in do_train
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
File "/workspace/axolotl/src/axolotl/cli/__init__.py", line 403, in load_datasets
train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
File "/workspace/axolotl/src/axolotl/utils/data/sft.py", line 57, in prepare_dataset
with zero_first(is_main_process()):
File "/root/miniconda3/envs/py3.10/lib/python3.10/contextlib.py", line 135, in __enter__
return next(self.gen)
File "/workspace/axolotl/src/axolotl/utils/distributed.py", line 68, in zero_first
barrier()
File "/workspace/axolotl/src/axolotl/utils/distributed.py", line 34, in barrier
dist.barrier()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3696, in barrier
work = default_pg.barrier(opts=opts)
RuntimeError: CUDA error: uncorrectable ECC error encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
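Since the message points at an uncorrectable ECC error, a quick sanity check is to ask the driver directly whether the cards are reporting ECC errors, and to re-run with synchronous kernel launches so the stack trace lands on the failing call. A minimal sketch (the config filename is a placeholder):
# check per-GPU ECC error counters reported by the driver
nvidia-smi -q -d ECC
# re-run with synchronous launches so the trace points at the real call
CUDA_LAUNCH_BLOCKING=1 accelerate launch -m axolotl.cli.train your_config.yml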
Tried it on an 8x H100 PCIe.
Found it unusual that it got stuck on this part:
[2024-05-08 14:11:28,264] [INFO] [axolotl.load_tokenized_prepared_datasets:410] [PID:3641] [RANK:0] merging datasets
Been waiting for 10 minutes. I'm killing it.
A different error on a 1x H100 SXM:
warnings.warn(
/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
cuBLAS API failed with status 15
A: torch.Size([49152, 4096]), B: torch.Size([4096, 4096]), C: (49152, 4096); (lda, ldb, ldc): (c_int(1572864), c_int(131072), c_int(1572864)); (m, n, k): (c_int(49152), c_int(4096), c_int(4096))
Traceback (most recent call last):
File "/root/miniconda3/envs/py3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/py3.10/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 59, in <module>
fire.Fire(do_cli)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 143, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 477, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fire/core.py", line 693, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 35, in do_cli
return do_train(parsed_cfg, parsed_cli_args)
File "/workspace/axolotl/src/axolotl/cli/train.py", line 55, in do_train
return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
File "/workspace/axolotl/src/axolotl/train.py", line 170, in train
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/transformers/trainer.py", line 1780, in train
return inner_training_loop(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/transformers/trainer.py", line 2118, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/transformers/trainer.py", line 3036, in training_step
loss = self.compute_loss(model, inputs)
File "/workspace/axolotl/src/axolotl/core/trainer_builder.py", line 493, in compute_loss
return super().compute_loss(model, inputs, return_outputs=return_outputs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/transformers/trainer.py", line 3059, in compute_loss
outputs = model(**inputs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/utils/operations.py", line 822, in forward
return model_forward(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/utils/operations.py", line 810, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
return func(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/peft/peft_model.py", line 1395, in forward
return self.base_model(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 179, in forward
return self.model.forward(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 1196, in forward
outputs = self.model(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/workspace/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py", line 809, in llama_model_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_compile.py", line 24, in inner
return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
return fn(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
return fn(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 451, in checkpoint
return CheckpointFunction.apply(function, preserve, *args)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 230, in forward
outputs = run_function(*args)
File "/workspace/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py", line 803, in custom_forward
return module(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/workspace/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py", line 902, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/workspace/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py", line 417, in flashattn_forward
query_states = self.q_proj(hidden_states)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/peft/tuners/lora/bnb.py", line 217, in forward
result = self.base_layer(x, *args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 687, in forward
out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 562, in matmul
return MatMul8bitLt.apply(A, B, out, bias, state)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 401, in forward
out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/bitsandbytes/functional.py", line 1967, in igemmlt
raise Exception('cublasLt ran into an error!')
Exception: cublasLt ran into an error!
@drummerv I don't know that the "RunPod's Axolotl Jupyter template" is the "official" correct template. This direct link should get you the correct image (https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz), although there is a bug with RunPod using that link, so for now you can use this link: https://www.runpod.io/console/explore/v2ickqhz9s?ref=6i7fkpdz
@winglian Fixed it. I just had to roll back to an older commit:
git checkout 132eb740f036eff0fa8b239ddaf0b7a359ed1732
Thanks, there are quite a few changes since then! Just going to drop this here so I can remember to look through the changeset later: https://github.com/OpenAccess-AI-Collective/axolotl/compare/132eb740f036eff0fa8b239ddaf0b7a359ed1732...main
@drummerv If it helps, I also just ran into this "cuBLAS API failed with status 15"
on an H100, and while going back to an old commit did not fix it (I tried probably 15 containers from around that time period), loading in 16-bit instead of 4- or 8-bit solved the issue (on main@HEAD); see the config sketch below.
This issue on alpaca-lora seems related: https://github.com/tloen/alpaca-lora/issues/174
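For reference, the change on my side was just the quantization flags in the axolotl config yaml, roughly along these lines (standard config keys; adapt to your own file):
load_in_8bit: false
load_in_4bit: false
bf16: true
With those set, the bitsandbytes MatMul8bitLt path (and the cublasLt call that fails with status 15) isn't hit at all.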
Expected Behavior
I ran Axolotl around two days ago and it worked fine. 8x H100 SXM using RunPod's Axolotl Jupyter template.
Current behaviour
When I ran the same config today, it gave me this error:
Steps to reproduce
Config yaml
Possible solution
Does the RunPod / Docker template use the latest commit? We can narrow it down to the last 1 to 2 days.
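If it helps narrow the window, a rough way to list the candidate commits from the axolotl checkout (date range approximate) is:
git log --since="2 days ago" --oneline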
Which Operating Systems are you using?
Python Version
3.10
axolotl branch-commit
main-latest