intel-analytics / ipex-llm

Accelerate local LLM inference and finetuning (LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, Phi, etc.) on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max); seamlessly integrate with llama.cpp, Ollama, HuggingFace, LangChain, LlamaIndex, DeepSpeed, vLLM, FastChat, Axolotl, etc.
Apache License 2.0

llama2-13b transformer INT4 on Arc A770 Ubuntu22.04 has long 1st latency and long loading time #8804

Closed KiwiHana closed 10 months ago

KiwiHana commented 10 months ago

Question 1: The loading time of the FP16 model is too long (~2 min). Can I save the transformer INT4 model in advance?
Question 2: Why does streaming with TextIteratorStreamer have a long first-token latency of 2.16 s? Can the first-token latency be lower than 0.5 s?

| Model | Transformer INT4 output mode | In/out tokens | 1st token | 2nd+ avg | VRAM | Mem | Load model time |
|---|---|---|---|---|---|---|---|
| Llama2 | Streaming, token by token (TextIteratorStreamer) | 32/32 | 2.16s | 69ms/token | 4.5GB | 2.2GB | 118.2s |
| Llama2 | Whole sentence output at once | 32/32 | 2.86s | - | 4.5GB | 2.2GB | 118.2s |

import os
import torch
import time
import argparse
import numpy as np

import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer,LlamaTokenizer

from transformers import TextStreamer,TextIteratorStreamer

if __name__ == '__main__':
    model_path = "/opt/WD/ruonan/Llama-2-7b-chat-hf"
    print(model_path)
    device = "xpu"

    time_start = time.time()
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
    model = model.half().to('xpu')
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    time_model_end = time.time()

    print("load model Done",time_model_end - time_start)

    with torch.inference_mode():
        torch.xpu.synchronize()

        prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')

        time_start_out = time.time()

        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        torch.xpu.synchronize()       
        print("output: ", output_str)

        time_end_out = time.time()

        print("total sentence time",time_end_out-time_start_out)

        if 1:
            inputs = tokenizer(prompt, return_tensors="pt").to('xpu')
            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            response = ""
            timeStart = time.time() 

            timeFirst = 0
            timeFirstRecord = False

            out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=32)
            torch.xpu.synchronize()
            for new_text in streamer:
                response += new_text
                print(response)
                if timeFirstRecord == False:
                    timeFirst = time.time() - timeStart
                    timeFirstRecord = True

            torch.xpu.synchronize()
            timeCost = time.time() - timeStart
            print(timeFirst,timeCost/31)
rnwang04 commented 10 months ago

Answer for question 1

  1. The long model loading time is just because the read speed of "/opt/WD/" is too slow... If you move the llama2-7b model files to "~/", the loading time will be normal.
  2. You can save the transformer INT4 model in advance and load it back later; the usage looks like this (a timing sketch follows the snippet):
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 load_in_4bit=True,
                                                 optimize_model=False,
                                                 trust_remote_code=True)
    model = model.cpu()
    model.save_low_bit(model_save_dir)

    load_model = AutoModelForCausalLM.load_low_bit(model_save_dir,
                                                   optimize_model=False)
    load_model = load_model.to('xpu')
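
For completeness, a minimal sketch of reloading the saved checkpoint and timing it (model_path and model_save_dir are placeholder paths; the tokenizer is still loaded from the original checkpoint):

    import time

    from bigdl.llm.transformers import AutoModelForCausalLM
    from transformers import LlamaTokenizer

    model_path = "your-original-model-path"    # placeholder: original HF checkpoint (for the tokenizer)
    model_save_dir = "your-int4-save-dir"      # placeholder: directory written by save_low_bit above

    # Reload the saved INT4 weights directly and measure the loading time.
    time_start = time.time()
    load_model = AutoModelForCausalLM.load_low_bit(model_save_dir, optimize_model=False)
    load_model = load_model.to('xpu')
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    print("load low-bit model time:", time.time() - time_start)

The saved INT4 checkpoint is much smaller than the FP16 one, so reloading it should also be far less sensitive to slow disk reads.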

Answer for question 2

The slow speed of generate and TextIteratorStreamer has several causes:

  • The Arc INT4 model needs a warmup generate; after that, the generation speed will be normal. I will update our examples for this part.
  • If you don't care about VRAM, you can remove half() and just use the FP32 model, which brings faster speed.
  • The usage of TextIteratorStreamer is a little wrong (generate() is called synchronously and the streamer is only iterated after generation has finished), so the measured times are not accurate.

Maybe try this new script again : )

import os
import torch
import time
import argparse
import numpy as np

import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer,LlamaTokenizer

from transformers import TextStreamer, TextIteratorStreamer

if __name__ == '__main__':
    model_path = "your model path"  # replace with the local model path
    print(model_path)
    device = "xpu"

    time_start = time.time()
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True, trust_remote_code=True, optimize_model=False)
    model = model.to('xpu')
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    time_model_end = time.time()

    print("load model Done",time_model_end - time_start)

    # warmup first
    prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)

    with torch.inference_mode():
        torch.xpu.synchronize()

        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
        print("input length is: ", len(input_ids[0]))

        time_start_out = time.time()

        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        torch.xpu.synchronize()
        print("output: ", output_str)

        time_end_out = time.time()

        print("total sentence time",time_end_out - time_start_out)

        if 1:
            inputs = tokenizer(prompt, return_tensors="pt").to('xpu')
            response = ""
            timeStart = time.time() 

            timeFirst = 0
            timeFirstRecord = False

            # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=32)

            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            generate_kwargs = dict(
                        **inputs,
                        streamer=streamer,
                        temperature=0.9, top_p=0.9, max_new_tokens=32
                    )

            # to ensure non-blocking access to the generated text, the generation process should be run in a separate thread
            from threading import Thread

            thread = Thread(target=model.generate, kwargs=generate_kwargs)
            thread.start()

            output_str = []
            print("Response: ", end="")
            for stream_output in streamer:
                output_str.append(stream_output)
                print(stream_output, end="")
                if timeFirstRecord == False:
                    timeFirst = time.time() - timeStart
                    timeFirstRecord = True

            torch.xpu.synchronize()
            timeCost = time.time() - timeStart
            print("==========time==============")
            print(timeFirst, (timeCost-timeFirst) / 31)
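
Here timeFirst is the first-token latency, and (timeCost - timeFirst) / 31 approximates the average per-token latency of the remaining 31 of the 32 generated tokens (max_new_tokens=32).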

My log is:

load model Done 3.7374014854431152
input length is:  32
output:  Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. But her parents were always telling her to stay close to home, to be careful, and to not go anywhere alone.

One day, the little
total sentence time 1.149233102798462
Response: . But her parents were always telling her to stay close to home, to be careful, and to not go anywhere alone.

One day, the little0.21254897117614746 0.030483153558546496

and my versions are:

bigdl-core-xe               2.4.0b20230824
bigdl-llm                   2.4.0b20230824
KiwiHana commented 10 months ago

Thanks a lot 🙂

model = model.half().to('xpu')
bigdl-core-xe 2.4.0b20230824
bigdl-llm 2.4.0b20230824
0.356173038482666 0.06664117690055602