To avoid re-quantizing on every run, you can save the INT4 model once with save_low_bit and reload it later with load_low_bit:

# quantize to INT4 once and save the low-bit checkpoint
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=False,
                                             trust_remote_code=True)
model = model.cpu()
model.save_low_bit(model_save_dir)

# on later runs, load the saved INT4 checkpoint directly
load_model = AutoModelForCausalLM.load_low_bit(model_save_dir,
                                               optimize_model=False)
load_model = load_model.to('xpu')
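For reference, a minimal sketch (the checkpoint path is a placeholder; it assumes a model was already saved with save_low_bit as above) that times the low-bit reload, for comparison with the ~2 min FP16 load reported below:

import time
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM

model_save_dir = "path/to/saved/int4/model"  # placeholder: directory written by save_low_bit

t0 = time.time()
load_model = AutoModelForCausalLM.load_low_bit(model_save_dir,
                                               optimize_model=False)
load_model = load_model.to('xpu')
print("low-bit load time:", time.time() - t0)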
The slow measured speed of generate and TextIteratorStreamer can have several causes (for example, timing without a warmup run, or without synchronizing the XPU before and after generation).
Maybe try this new script again : )
import os
import torch
import time
import argparse
import numpy as np

import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, LlamaTokenizer
from transformers import TextStreamer, TextIteratorStreamer

if __name__ == '__main__':
    model_path = "path/to/your/model"  # your model path
    print(model_path)
    device = "xpu"

    time_start = time.time()
    model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit=True,
                                                 trust_remote_code=True, optimize_model=False)
    model = model.to('xpu')
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    time_model_end = time.time()
    print("load model Done", time_model_end - time_start)

    # warmup first
    prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    output = model.generate(input_ids, do_sample=False, max_new_tokens=32)

    with torch.inference_mode():
        torch.xpu.synchronize()
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
        print("input length is: ", len(input_ids[0]))

        time_start_out = time.time()
        output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
        output_str = tokenizer.decode(output[0], skip_special_tokens=True)
        torch.xpu.synchronize()
        print("output: ", output_str)
        time_end_out = time.time()
        print("total sentence time", time_end_out - time_start_out)

        if 1:  # streaming generation with TextIteratorStreamer
            inputs = tokenizer(prompt, return_tensors="pt").to('xpu')
            response = ""
            timeStart = time.time()
            timeFirst = 0
            timeFirstRecord = False
            # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=32)
            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
            generate_kwargs = dict(
                **inputs,
                streamer=streamer,
                temperature=0.9, top_p=0.9, max_new_tokens=32
            )

            # to ensure non-blocking access to the generated text, the generation process should be run in a separate thread
            from threading import Thread
            thread = Thread(target=model.generate, kwargs=generate_kwargs)
            thread.start()

            output_str = []
            print("Response: ", end="")
            for stream_output in streamer:
                output_str.append(stream_output)
                print(stream_output, end="")
                if timeFirstRecord == False:
                    timeFirst = time.time() - timeStart
                    timeFirstRecord = True
            torch.xpu.synchronize()
            timeCost = time.time() - timeStart
            print("==========time==============")
            print(timeFirst, (timeCost - timeFirst) / 31)  # 1st-token latency, then average latency per remaining token
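As a cross-check on the streamer's first-token timing, here is a minimal sketch (not part of the original script; it reuses model and input_ids from the script above) that approximates first-token latency with a plain generate call limited to one new token:

# approximate 1st-token latency without a streamer: generate exactly one token and time it
torch.xpu.synchronize()
t0 = time.time()
_ = model.generate(input_ids, do_sample=False, max_new_tokens=1)
torch.xpu.synchronize()
print("1st token latency (no streamer):", time.time() - t0)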
My log is:
load model Done 3.7374014854431152
input length is: 32
output: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. But her parents were always telling her to stay close to home, to be careful, and to not go anywhere alone.
One day, the little
total sentence time 1.149233102798462
Response: . But her parents were always telling her to stay close to home, to be careful, and to not go anywhere alone.
One day, the little0.21254897117614746 0.030483153558546496
And my versions are:
bigdl-core-xe 2.4.0b20230824
bigdl-llm 2.4.0b20230824
Thanks a lot 🙂
With model = model.half().to('xpu') (bigdl-core-xe 2.4.0b20230824, bigdl-llm 2.4.0b20230824), the script prints 0.356173038482666 (1st token) and 0.06664117690055602 (average per subsequent token).
Question 1: Loading the FP16 model takes too long (~2 min). Can I save the Transformer INT4 model in advance?
Question 2: Why does streaming with TextIteratorStreamer have a long 1st-token latency of 2.16 s? Can the 1st-token latency be lower than 0.5 s?
| model | Transformer INT4 | In/out tokens | 1st token | 2nd+ avg | VRAM | mem | Load model time |
|---|---|---|---|---|---|---|---|
| Llama2 | token-by-token output (Streamer) | 32/32 | 2.16s | 69ms/token | 4.5GB | 2.2GB | 118.2s |
| Llama2 | whole sentence output at once | 32/32 | 2.86s | - | 4.5GB | 2.2GB | 118.2s |