brando90 opened 1 month ago
maybe this?
unsloth_eos_token = "eos_token"
CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,)
pass
Maybe try to change this:
tokenizer = get_chat_template(
tokenizer,
chat_template = "Qwen2-1.5B",
mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
To this:
custom_template="""{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"""
custom_eos_token = "eos_token"
tokenizer = get_chat_template(
tokenizer,
chat_template = (custom_template, custom_eos_token),
mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
Since the eos token seems to be hard-coded in the template (which might not be good practice, since it may be tokenized differently), I'm not sure what to do with the custom_eos_token.
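For what it's worth, a quick sanity check might help here (just a sketch, reusing the tokenizer from the snippet above; treating <|im_end|> as the end tag the template hard-codes is my assumption):
# Sketch: check that the end tag hard-coded in the template maps to a single
# known token id, and see what the tokenizer currently reports as its eos token.
# Assumes `tokenizer` from the snippet above; "<|im_end|>" is an assumption.
end_tag = "<|im_end|>"
end_id = tokenizer.convert_tokens_to_ids(end_tag)
print(end_tag, "->", end_id)  # should be a valid id, not the unk id
print("eos_token:", tokenizer.eos_token, "->", tokenizer.eos_token_id)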
I'll look over this again over the weekend - sorry for the delay!!
let us know how it's going or if you need help! @danielhanchen
@danielhanchen I think I got it to work with this:
def test_unsloth_inference(
max_length: int = 8192,
use_4bit: bool = False,
):
""" interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
print('--> Running: test_unsloth_inference')
import os
from transformers import TextStreamer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
# model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
# model_name = 'unsloth/Qwen2-1.5B'
model_name = 'Qwen/Qwen2-1.5B'
model: FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_length,
dtype=None, # Auto-detection for Float16/BFloat16
load_in_4bit=use_4bit,
)
# tokenizer = get_chat_template(
# tokenizer,
# chat_template = "Qwen2-1.5B",
# mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
# )
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# messages = [{"role": "human", "value": "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"}]
prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
print('prompt: ', prompt)
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
# inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# st()
print('text: ', text)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
# print(f'{inputs.size()=}')
text_streamer = TextStreamer(tokenizer)
# st()
res = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
print(f'{res=}')
completion: str = tokenizer.decode(res[0])
print(f'{completion=}')
if __name__ == "__main__":
import fire
import time
print('\n-- Start')
start_time = time.time()
# fire.Fire(test_unsloth_vllm)
# fire.Fire(test_unsloth_inference)
# fire.Fire(test_unsloth_plus_hf_inference)
fire.Fire(test_unsloth_inference)
print(f"Time taken: {time.time() - start_time:.2f} seconds, or {(time.time() - start_time) / 60:.2f} minutes, or {(time.time() - start_time) / 3600:.2f} hours.\a")
The only thing I don't understand is why the code automatically prints to the screen when I do inference. This will slow down my code a lot. How do I stop it?
It still does it even if one removes the completion print statement:
(AI4Lean_unsloth) root@miranebr-math-p4de-math-test-eval:~# python ~/AI4Lean/playground/chat_template_qwen2_pg.py
-- Start
--> Running: test_unsloth_inference
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))== Unsloth 2024.9: Fast Qwen2 patching. Transformers = 4.44.2.
\\ /| GPU: NVIDIA A100-SXM4-80GB. Max memory: 79.151 GB. Platform = Linux.
O^O/ \_/ \ Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\ / Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
"-____-" Free Apache license: http://github.com/unslothai/unsloth
prompt: Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,
text: <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant
The Fibonacci sequence is defined as follows:
Fibonacci(0) = 0
Fibonacci(1) = 1
Fibonacci(n) = Fibonacci(n-1) + Fibonacci(n-2) for n > 1
So, the sequence starts as:
0, 1,
res=tensor([[151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465,
553, 54364, 14817, 13, 1446, 525, 264, 10950, 17847,
13, 151645, 198, 151644, 872, 198, 23526, 279, 15801,
26378, 24464, 8500, 369, 220, 16, 3019, 1172, 4486,
25, 220, 16, 11, 220, 16, 11, 220, 17,
11, 220, 18, 11, 220, 20, 11, 220, 23,
11, 151645, 198, 151644, 77091, 198, 785, 79683, 8500,
374, 4512, 438, 11017, 1447, 37, 579, 39345, 7,
15, 8, 284, 220, 15, 198, 37, 579, 39345,
7, 16, 8, 284, 220, 16, 198, 37, 579,
39345, 1445, 8, 284, 79683, 1445, 12, 16, 8,
488, 79683, 1445, 12, 17, 8, 369, 308, 861,
220, 16, 271, 4416, 11, 279, 8500, 8471, 438,
1447, 15, 11, 220, 16, 11, 220]],
device='cuda:0')
completion='<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nContinue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\nThe Fibonacci sequence is defined as follows:\n\nFibonacci(0) = 0\nFibonacci(1) = 1\nFibonacci(n) = Fibonacci(n-1) + Fibonacci(n-2) for n > 1\n\nSo, the sequence starts as:\n\n0, 1, '
Time taken: 29.97 seconds, or 0.50 minutes, or 0.01 hours.
OK, this stopped the printing to stdout:
def test_unsloth_inference(
max_length: int = 8192,
use_4bit: bool = False,
):
""" interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
print('--> Running: test_unsloth_inference')
import os
from transformers import TextStreamer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
# model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
# model_name = 'unsloth/Qwen2-1.5B'
model_name = 'Qwen/Qwen2-1.5B'
model: FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_length,
dtype=None, # Auto-detection for Float16/BFloat16
load_in_4bit=use_4bit,
)
# tokenizer = get_chat_template(
# tokenizer,
# chat_template = "Qwen2-1.5B",
# mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
# )
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# messages = [{"role": "human", "value": "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"}]
prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
print('prompt: ', prompt)
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
# inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# st()
print('text: ', text)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
# print(f'{inputs.size()=}')
# text_streamer = TextStreamer(tokenizer)
# st()
# res = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
res = model.generate(**inputs, max_new_tokens = 64)
print(f'{res=}')
completion: str = tokenizer.decode(res[0])
print(f'{completion=}')
Removed the HF text streamer.
Slightly more efficient:
def test_unsloth_inference_efficient(
max_length: int = 8192,
use_4bit: bool = False,
):
""" interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
print('--> Running: test_unsloth_inference_efficient')
import os
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
print(f'{model_name=}')
model: FastLanguageModel
tokenizer: AutoTokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_length,
dtype=None, # Auto-detection for Float16/BFloat16
load_in_4bit=use_4bit,
)
print(f'{type(test_unsloth_inference)=}')
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
print('prompt: ', prompt)
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
inputs = tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_tensors='pt'
).to(model.device)
print(f'{inputs=}')
# res = model.generate(**inputs, max_new_tokens=64)
res = model.generate(inputs, max_new_tokens=64)
print(f'{len(res)=}')
completion: str = tokenizer.decode(res[0])
print(f'{completion=}')
No need to go through the text: we can get the token tensors immediately and only decode back to text at the end. I also removed the text streamer that prints to stdout, since I don't need that.
I am interested in figuring out how to do batch inference with unsloth instead of 1 prompt at a time.
For now this is working, though vLLM is not.
TODO: figure out how to extend this code to multiple generations; right now it only works for 1, as seen from res[0] (see the sketch after the code below).
def test_unsloth_inference_efficient(
max_length: int = 8192,
use_4bit: bool = False,
):
""" inference notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
print('--> Running: test_unsloth_inference_efficient')
import os
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
print(f'{model_name=}')
model: FastLanguageModel
tokenizer: AutoTokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_length,
dtype=None, # Auto-detection for Float16/BFloat16
load_in_4bit=use_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
print('prompt: ', prompt)
messages = [
{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
{"role": "user", "content": prompt}
]
inputs = tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_tensors='pt'
).to(model.device)
print(f'{inputs=}')
# res = model.generate(**inputs, max_new_tokens=64)
res = model.generate(inputs, max_new_tokens=64)
print(f'{len(res)=}')
completion: str = tokenizer.decode(res[0])
print(f'{completion=}')
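Not tested, but for the TODO above, one way to get multiple generations for the same prompt is num_return_sequences; a sketch, reusing model, tokenizer, and inputs from the function above (the sampling settings are arbitrary):
# Sketch: sample several completions for one prompt via num_return_sequences.
# Assumes `model`, `tokenizer`, and `inputs` exactly as in the function above.
res = model.generate(
    inputs,
    max_new_tokens=64,
    do_sample=True,            # sampling is needed to get distinct sequences
    temperature=0.7,
    num_return_sequences=4,
)
completions: list[str] = tokenizer.batch_decode(res, skip_special_tokens=True)
for i, completion in enumerate(completions):
    print(f'--- completion {i} ---')
    print(completion)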
@brando90 So sorry for the delay!! On the Qwen 2.5 chat template - you can use qwen2.5
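i.e. something along these lines (a sketch only; the template name "qwen2.5" is taken from the comment above and not checked against the CHAT_TEMPLATES keys):
from unsloth.chat_templates import get_chat_template

# "qwen2.5" follows the suggestion above; if it raises, check the keys in
# unsloth.chat_templates.CHAT_TEMPLATES for the exact name.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen2.5",
)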
For vLLM, I think I answered in the other thread.
For batch inference, you need to use a list of strings.
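A minimal batch-inference sketch along those lines, assuming the model and tokenizer loaded as in the snippets above (the example prompts and the pad-token fallback are assumptions):
# Sketch: batch inference over a list of prompts.
# Assumes `model` and `tokenizer` from FastLanguageModel.from_pretrained above.
prompts = [
    "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,",
    "What is 2 + 2?",
]
texts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": p}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for p in prompts
]
tokenizer.padding_side = "left"    # decoder-only models are padded on the left for generation
if tokenizer.pad_token is None:    # assumption: fall back to eos if no pad token is set
    tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
res = model.generate(**inputs, max_new_tokens=64)
completions = tokenizer.batch_decode(res, skip_special_tokens=True)
for c in completions:
    print(c)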
I get this error:
From this code:
what is the right way to chat template Qwen 2 in unsloth?