unslothai / unsloth

Finetune Llama 3.2, Mistral, Phi, Qwen & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

What is the right way to load Qwen2's chat interface? #1040

Open brando90 opened 1 month ago

brando90 commented 1 month ago

I get this error:

    chat_template, stop_word, yes_map_eos_token, ollama_modelfile = CHAT_TEMPLATES[chat_template]
                                                                    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
KeyError: 'Qwen2-1.5B'

From this code:

def test_unsloth_inference(        
        max_length: int = 8192,
        use_4bit: bool = False,
    ):
    """ interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
    from transformers import TextStreamer
    from unsloth.chat_templates import get_chat_template
    model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    model: FastLanguageModel 
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_length,
        dtype=None,  # Auto-detection for Float16/BFloat16
        load_in_4bit=use_4bit,
    )
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = "Qwen2-1.5B",
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    messages = [{"from": "human", "value": "Continue the fibonnaci sequence for a 1 step only please: 1, 1, 2, 3, 5, 8,"}]
    inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)
    res = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
    print(f'{res=}')

What is the right way to apply a chat template for Qwen 2 in unsloth?
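
Side note: the KeyError means "Qwen2-1.5B" is simply not a key in CHAT_TEMPLATES. A minimal sketch to list the valid names, assuming CHAT_TEMPLATES is the module-level dict shown in the traceback:

    # Hedged sketch: print the valid chat_template names so the KeyError can
    # be avoided up front. Assumes CHAT_TEMPLATES is importable as below.
    from unsloth.chat_templates import CHAT_TEMPLATES

    print(sorted(CHAT_TEMPLATES.keys()))  # "Qwen2-1.5B" will not be among these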

brando90 commented 1 month ago

https://discord.com/channels/1179035537009545276/1286411100379938931

brando90 commented 1 month ago

maybe this?

unsloth_eos_token = "eos_token"
CHAT_TEMPLATES["unsloth"] = (unsloth_template, unsloth_eos_token, False, unsloth_ollama,)
pass
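
A hedged sketch of that idea: register a new key in CHAT_TEMPLATES before calling get_chat_template, matching the four-element tuple the KeyError line unpacks (chat_template, stop_word, yes_map_eos_token, ollama_modelfile). The ChatML body below is illustrative, not the exact string unsloth ships:

    # Hedged sketch: a custom Qwen2-style (ChatML) entry. The stop word and
    # the None modelfile are assumptions, not unsloth-documented behaviour.
    from unsloth.chat_templates import CHAT_TEMPLATES, get_chat_template

    qwen2_chatml = (
        "{% for message in messages %}"
        "{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}"
        "{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}"
    )
    CHAT_TEMPLATES["qwen2-chatml"] = (qwen2_chatml, "<|im_end|>", False, None)

    tokenizer = get_chat_template(tokenizer, chat_template = "qwen2-chatml")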

Trapper4888 commented 1 month ago

Maybe try to change this:

    tokenizer = get_chat_template(
        tokenizer,
        chat_template = "Qwen2-1.5B",
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    )

To this:

    custom_template="""{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"""
    custom_eos_token = "eos_token"
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (custom_template, custom_eos_token),
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    )

Since the eos token seems to be hardcoded in the template (which might not be good practice, since it may be tokenized differently), I'm not sure what to do with the custom_eos_token.
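
One hedged option: pass the tokenizer's own EOS string instead of the literal "eos_token", so the stop word matches what the template actually emits; a minimal sketch under that assumption:

    # Hedged sketch: use the tokenizer's real EOS string as the tuple's second
    # element. For Qwen2 chat models this is typically "<|im_end|>", but check
    # tokenizer.eos_token rather than hardcoding it.
    custom_eos_token = tokenizer.eos_token
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (custom_template, custom_eos_token),
    )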

danielhanchen commented 1 month ago

I'll look over this again over the weekend - sorry for the delay!!

brando90 commented 1 month ago

> I'll look over this again over the weekend - sorry for the delay!!

let us know how it's going or if you need help! @danielhanchen

brando90 commented 1 month ago

@danielhanchen I think I got it to work with this:

def test_unsloth_inference(        
        max_length: int = 8192,
        use_4bit: bool = False,
    ):
    """ interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
    print('--> Running: test_unsloth_inference')
    import os
    from transformers import TextStreamer
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    # model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    # model_name = 'unsloth/Qwen2-1.5B'
    model_name = 'Qwen/Qwen2-1.5B'
    model: FastLanguageModel 
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_length,
        dtype=None,  # Auto-detection for Float16/BFloat16
        load_in_4bit=use_4bit,
    )
    # tokenizer = get_chat_template(
    #     tokenizer,
    #     chat_template = "Qwen2-1.5B",
    #     mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    # )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    # messages = [{"role": "human", "value": "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"}]
    prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
    print('prompt: ', prompt)
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    # inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print('text: ', text)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # print(f'{inputs.size()=}')
    text_streamer = TextStreamer(tokenizer)
    res = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
    print(f'{res=}')
    completion: str = tokenizer.decode(res[0])
    print(f'{completion=}')

if __name__ == "__main__":
    import fire
    import time
    print('\n-- Start')
    start_time = time.time()
    fire.Fire(test_unsloth_inference)
    print(f"Time taken: {time.time() - start_time:.2f} seconds, or {(time.time() - start_time) / 60:.2f} minutes, or {(time.time() - start_time) / 3600:.2f} hours.\a")

The only thing I don't understand is why inference automatically prints to the screen. This will slow down my code a lot. How do I stop it?

brando90 commented 1 month ago

ref: https://discord.com/channels/1179035537009545276/1288946827688939591

brando90 commented 1 month ago

It still prints even if one removes the completion print statement:

(AI4Lean_unsloth) root@miranebr-math-p4de-math-test-eval:~# python ~/AI4Lean/playground/chat_template_qwen2_pg.py

-- Start
--> Running: test_unsloth_inference
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-80GB. Max memory: 79.151 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
prompt:  Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,
text:  <|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>
<|im_start|>assistant
The Fibonacci sequence is defined as follows:

Fibonacci(0) = 0
Fibonacci(1) = 1
Fibonacci(n) = Fibonacci(n-1) + Fibonacci(n-2) for n > 1

So, the sequence starts as:

0, 1, 
res=tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  23526,    279,  15801,
          26378,  24464,   8500,    369,    220,     16,   3019,   1172,   4486,
             25,    220,     16,     11,    220,     16,     11,    220,     17,
             11,    220,     18,     11,    220,     20,     11,    220,     23,
             11, 151645,    198, 151644,  77091,    198,    785,  79683,   8500,
            374,   4512,    438,  11017,   1447,     37,    579,  39345,      7,
             15,      8,    284,    220,     15,    198,     37,    579,  39345,
              7,     16,      8,    284,    220,     16,    198,     37,    579,
          39345,   1445,      8,    284,  79683,   1445,     12,     16,      8,
            488,  79683,   1445,     12,     17,      8,    369,    308,    861,
            220,     16,    271,   4416,     11,    279,   8500,   8471,    438,
           1447,     15,     11,    220,     16,     11,    220]],
       device='cuda:0')
completion='<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nContinue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\nThe Fibonacci sequence is defined as follows:\n\nFibonacci(0) = 0\nFibonacci(1) = 1\nFibonacci(n) = Fibonacci(n-1) + Fibonacci(n-2) for n > 1\n\nSo, the sequence starts as:\n\n0, 1, '
Time taken: 29.97 seconds, or 0.50 minutes, or 0.01 hours.

brando90 commented 1 month ago

OK, this stopped the printing to stdout:

def test_unsloth_inference(        
        max_length: int = 8192,
        use_4bit: bool = False,
    ):
    """ interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
    print('--> Running: test_unsloth_inference')
    import os
    from transformers import TextStreamer
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    # model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    # model_name = 'unsloth/Qwen2-1.5B'
    model_name = 'Qwen/Qwen2-1.5B'
    model: FastLanguageModel 
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_length,
        dtype=None,  # Auto-detection for Float16/BFloat16
        load_in_4bit=use_4bit,
    )
    # tokenizer = get_chat_template(
    #     tokenizer,
    #     chat_template = "Qwen2-1.5B",
    #     mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    # )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    # messages = [{"role": "human", "value": "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"}]
    prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
    print('prompt: ', prompt)
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    # inputs = tokenizer.apply_chat_template(messages, tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print('text: ', text)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # print(f'{inputs.size()=}')

    # text_streamer = TextStreamer(tokenizer)
    # res = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
    res = model.generate(**inputs, max_new_tokens = 64)
    print(f'{res=}')
    completion: str = tokenizer.decode(res[0])
    print(f'{completion=}')
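
Note that res[0] still includes the echoed prompt tokens (visible in the completion printed earlier). A minimal sketch for decoding only the newly generated tokens, assuming inputs is the BatchEncoding returned by the tokenizer call above:

    # Hedged sketch: slice off the prompt so only new tokens are decoded, and
    # drop special tokens like <|im_start|> for a clean string.
    prompt_len = inputs["input_ids"].shape[1]
    completion: str = tokenizer.decode(res[0][prompt_len:], skip_special_tokens=True)
    print(f'{completion=}')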

brando90 commented 1 month ago

In short: remove the HF text streamer.

brando90 commented 1 month ago

slightly more efficient:

def test_unsloth_inference_efficient(        
        max_length: int = 8192,
        use_4bit: bool = False,
    ):
    """ interence notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
    print('--> Running: test_unsloth_inference_efficient')
    import os
    from transformers import AutoTokenizer
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    print(f'{model_name=}')
    model: FastLanguageModel 
    tokenizer: AutoTokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_length,
        dtype=None,  # Auto-detection for Float16/BFloat16
        load_in_4bit=use_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
    print('prompt: ', prompt)
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='pt'
    ).to(model.device)
    print(f'{inputs=}')
    # res = model.generate(**inputs, max_new_tokens=64)
    res = model.generate(inputs, max_new_tokens=64)
    print(f'{len(res)=}')
    completion: str = tokenizer.decode(res[0])
    print(f'{completion=}')

No need to round-trip through text: we can get the tensors immediately and only decode back to text at the end. I also removed the text streamer that prints to stdout, since I don't need it.

I am interested in figuring out how to do batch inference with unsloth instead of one prompt at a time.

brando90 commented 1 month ago

For now this is working, though vLLM is not.

brando90 commented 1 month ago

TODO: figure out how to extend my code to multiple generations; right now I've only made it work for one, as seen by res[0]. (A sketch follows the code below.)

def test_unsloth_inference_efficient(        
        max_length: int = 8192,
        use_4bit: bool = False,
    ):
    """ inference notebook: https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing """
    print('--> Running: test_unsloth_inference_efficient')
    import os
    from transformers import AutoTokenizer
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    model_name = os.path.expanduser('~/data/runs/09192024_12h35m27s_run/train/checkpoint-820')
    print(f'{model_name=}')
    model: FastLanguageModel 
    tokenizer: AutoTokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_length,
        dtype=None,  # Auto-detection for Float16/BFloat16
        load_in_4bit=use_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    prompt = "Continue the fibonnaci sequence for 1 step only please: 1, 1, 2, 3, 5, 8,"
    print('prompt: ', prompt)
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='pt'
    ).to(model.device)
    print(f'{inputs=}')
    # res = model.generate(**inputs, max_new_tokens=64)
    res = model.generate(inputs, max_new_tokens=64)
    print(f'{len(res)=}')
    completion: str = tokenizer.decode(res[0])
    print(f'{completion=}')
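
A hedged sketch for that TODO, using standard transformers generate kwargs rather than anything unsloth-specific (num_return_sequences > 1 needs sampling or beam search, otherwise greedy decoding returns identical rows):

    # Hedged sketch: N generations per prompt via standard HF generate kwargs.
    res = model.generate(
        inputs,
        max_new_tokens=64,
        do_sample=True,          # sampling so the N sequences actually differ
        temperature=0.7,
        num_return_sequences=4,  # res now has 4 rows instead of 1
    )
    completions: list[str] = tokenizer.batch_decode(res, skip_special_tokens=True)
    for i, completion in enumerate(completions):
        print(f'--- generation {i} ---\n{completion}')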

danielhanchen commented 1 month ago

@brando90 So sorry for the delay!! For the Qwen 2.5 chat template, you can use qwen2.5.

I think I answered the vLLM question in the other thread.

For batch inference, you need to pass a list of strings.
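
A hedged sketch of that, tokenizing a list of chat-templated strings with left padding (the pad-token fallback is an assumption; unsloth may already configure it):

    # Hedged sketch: batch inference over a list of prompts. Left padding so
    # each row's generation starts right after its own prompt.
    raw_prompts = ["1, 1, 2, 3, 5, 8,", "2, 4, 8, 16, 32,"]
    texts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": p}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for p in raw_prompts
    ]
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:  # assumption: may already be set
        tokenizer.pad_token = tokenizer.eos_token
    batch = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
    res = model.generate(**batch, max_new_tokens=64)
    print(tokenizer.batch_decode(res, skip_special_tokens=True))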