Open osadchi opened 1 month ago
I actually have a question. To train models, llama-factory uses unsloth with Triton. Triton compiles its own code for the GPU, so if ZLUDA works through some kind of translation (it's too complicated for me :3), is it actually possible to make Triton work through ZLUDA with AMD cards?
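For context, what Triton actually runs is small Python-decorated kernels that it JIT-compiles to GPU code at import/launch time. A toy vector-add like the one below (just an illustration, not taken from unsloth or llama-factory) would also be the quickest way to test whether Triton kernels launch at all through ZLUDA:

```python
# Toy Triton kernel, only to illustrate what Triton JIT-compiles for the GPU.
# Sizes and BLOCK_SIZE are arbitrary; nothing here comes from unsloth/llama-factory.
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)
print(torch.allclose(out, x + y))
```

If that compiles and returns `True` on a ZLUDA device, the question really does come down to performance.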
I have not looked into Triton yet, but most likely yes. The biggest question is performance.
Maybe it will be needed in the future. When trying to load a model, VRAM starts filling up and then this happens <3 This is with the old version for ROCm 5.1 on an RX 6600M (I've replaced the cublas and cusparse DLLs in the torch folder):
```
Traceback (most recent call last):
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\queueing.py", line 575, in process_events
response = await route_utils.call_process_api(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
output = await app.get_blocks().process_api(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\blocks.py", line 1935, in process_api
result = await self.call_function(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\blocks.py", line 1532, in call_function
prediction = await utils.async_iteration(iterator)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 664, in __anext__
return await anyio.to_thread.run_sync(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\_backends\_asyncio.py", line 2441, in run_sync_in_worker_thread
return await future
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\_backends\_asyncio.py", line 943, in run
result = context.run(func, *args)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 647, in run_sync_iterator_async
return next(iterator)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 809, in gen_wrapper
response = next(iterator)
File "C:\LLaMA-Factory\src\llamafactory\webui\chatter.py", line 104, in load_model
super().__init__(args)
File "C:\LLaMA-Factory\src\llamafactory\chat\chat_model.py", line 52, in __init__
self.engine: "BaseEngine" = HuggingfaceEngine(model_args, data_args, finetuning_args, generating_args)
File "C:\LLaMA-Factory\src\llamafactory\chat\hf_engine.py", line 59, in __init__
self.model = load_model(
File "C:\LLaMA-Factory\src\llamafactory\model\loader.py", line 168, in load_model
model = init_adapter(config, model, model_args, finetuning_args, is_trainable)
File "C:\LLaMA-Factory\src\llamafactory\model\adapter.py", line 299, in init_adapter
model = _setup_lora_tuning(
File "C:\LLaMA-Factory\src\llamafactory\model\adapter.py", line 189, in _setup_lora_tuning
model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable)
File "C:\LLaMA-Factory\src\llamafactory\model\model_utils\unsloth.py", line 95, in load_unsloth_peft_model
model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\unsloth\models\loader.py", line 172, in from_pretrained
model, tokenizer = dispatch_model.from_pretrained(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\unsloth\models\llama.py", line 1186, in from_pretrained
model = AutoModelForCausalLM.from_pretrained(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\transformers\models\auto\auto_factory.py", line 564, in from_pretrained
return model_class.from_pretrained(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\transformers\modeling_utils.py", line 3838, in from_pretrained
) = cls._load_pretrained_model(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\transformers\modeling_utils.py", line 4298, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\transformers\modeling_utils.py", line 897, in _load_state_dict_into_meta_model
hf_quantizer.create_quantized_param(model, param, param_name, param_device, state_dict, unexpected_keys)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\transformers\quantizers\quantizer_bnb_4bit.py", line 217, in create_quantized_param
new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\bitsandbytes\nn\modules.py", line 191, in to
return self.cuda(device)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\bitsandbytes\nn\modules.py", line 169, in cuda
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\bitsandbytes\functional.py", line 936, in quantize_4bit
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
RuntimeError: CUDA error: operation not supported
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```
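The failing call sits inside bitsandbytes' 4-bit quantization path, so something along these lines (a sketch only, assuming the same unsloth_env with torch + bitsandbytes and the ZLUDA-patched DLLs; sizes are made up) should reproduce it without going through the whole webui:

```python
# Minimal repro sketch for the bitsandbytes failure above.
import torch
import bitsandbytes as bnb

print(torch.cuda.is_available(), torch.cuda.get_device_name(0))

# Plain allocation, the same kind of call that raised in bitsandbytes/functional.py.
absmax = torch.zeros((1024,), device="cuda", dtype=torch.float32)
print(absmax.sum())

# The actual 4-bit quantization path used when loading an nf4-quantized model.
w = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
w4, quant_state = bnb.functional.quantize_4bit(w, blocksize=64, quant_type="nf4")
print(w4.shape, w4.dtype)
```

If the `torch.zeros` line alone already throws "operation not supported", the problem is in the runtime/translation layer rather than in bitsandbytes itself.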
When using the new ZLUDA it loads just fine, but when trying to prompt:
```
Loading checkpoint shards: 100%|████████████████████████| 4/4 [00:04<00:00, 1.19s/it]
[INFO|modeling_utils.py:4364] 2024-10-21 06:41:36,139 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
[INFO|modeling_utils.py:4372] 2024-10-21 06:41:36,139 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at C:\llama-3-8b-gpt-4o-ru1.0.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|configuration_utils.py:953] 2024-10-21 06:41:36,141 >> loading configuration file C:\llama-3-8b-gpt-4o-ru1.0\generation_config.json
[INFO|configuration_utils.py:1000] 2024-10-21 06:41:36,142 >> Generate config GenerationConfig {
"bos_token_id": 128000,
"do_sample": true,
"eos_token_id": [
128001,
128009
],
"max_length": 4096,
"temperature": 0.6,
"top_p": 0.9
}
Some parameters are on the meta device because they were offloaded to the cpu.
10/21/2024 06:41:36 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
10/21/2024 06:41:36 - INFO - llamafactory.model.loader - all params: 8,030,261,248
10/21/2024 06:41:36 - WARNING - llamafactory.chat.hf_engine - There is no current event loop, creating a new one.
Traceback (most recent call last):
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\queueing.py", line 536, in process_events
response = await route_utils.call_process_api(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
output = await app.get_blocks().process_api(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\blocks.py", line 1935, in process_api
result = await self.call_function(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\blocks.py", line 1532, in call_function
prediction = await utils.async_iteration(iterator)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 664, in __anext__
return await anyio.to_thread.run_sync(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\_backends\_asyncio.py", line 2441, in run_sync_in_worker_thread
return await future
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\anyio\_backends\_asyncio.py", line 943, in run
result = context.run(func, *args)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 647, in run_sync_iterator_async
return next(iterator)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\gradio\utils.py", line 809, in gen_wrapper
response = next(iterator)
File "C:\LLaMA-Factory\src\llamafactory\webui\chatter.py", line 143, in stream
for new_text in self.stream_chat(
File "C:\LLaMA-Factory\src\llamafactory\chat\chat_model.py", line 109, in stream_chat
yield task.result()
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\concurrent\futures\_base.py", line 458, in result
return self.__get_result()
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\concurrent\futures\_base.py", line 403, in __get_result
raise self._exception
File "C:\LLaMA-Factory\src\llamafactory\chat\chat_model.py", line 125, in astream_chat
async for new_token in self.engine.stream_chat(messages, system, tools, image, video, **input_kwargs):
File "C:\LLaMA-Factory\src\llamafactory\chat\hf_engine.py", line 323, in stream_chat
stream = self._stream_chat(*input_args)
File "C:\ProgramData\anaconda3\envs\unsloth_env\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "C:\LLaMA-Factory\src\llamafactory\chat\hf_engine.py", line 225, in _stream_chat
gen_kwargs, _ = HuggingfaceEngine._process_args(
File "C:\LLaMA-Factory\src\llamafactory\chat\hf_engine.py", line 108, in _process_args
attention_mask = torch.ones_like(inputs, dtype=torch.bool)
RuntimeError: CUDA error: operation not supported
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```
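Here the op that dies is just `torch.ones_like(inputs, dtype=torch.bool)`, i.e. creating a bool tensor on the GPU, so a tiny check like this (same env assumed) would show whether bool CUDA tensors work at all under ZLUDA, plus one possible workaround to try:

```python
# Check for the second failure: bool tensor creation on the ZLUDA "cuda" device.
import torch

inputs = torch.zeros((1, 8), dtype=torch.long, device="cuda")
mask = torch.ones_like(inputs, dtype=torch.bool)  # the exact op that raised in hf_engine.py
print(mask)

# Possible workaround (untested assumption): build the mask on CPU and move it over.
mask_cpu = torch.ones(inputs.shape, dtype=torch.bool).to(inputs.device)
print(mask_cpu)
```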
Will it someday work with llama-factory for training models? I've run the webui, it loads the model but can't process the prompt, saying the CUDA function is unsupported :C