triton-inference-server / tensorrtllm_backend

The Triton TensorRT-LLM Backend
Apache License 2.0

speculative decoding performance #498

Open biaochen opened 1 week ago

biaochen commented 1 week ago

I've tested the speculative decoding feature using Llama 3 models: I converted the draft and target models to TRT engines and launched the Triton server with the BLS model, but there seems to be no performance gain.

Environment settings:

- version: v0.10.0
- hardware: 4x A100 80G
- target model: Meta-Llama-3-70B-Instruct
- draft model: Meta-Llama-3-8B-Instruct

The test steps are as follows:

  1. convert draft model to trt engine

    python convert_checkpoint.py \
    --model_dir /models/Meta-Llama-3-8B-Instruct/ \
    --output_dir ./llama3-8b-ckpt \
    --dtype float16 \
    --tp_size 4
    trtllm-build \
    --checkpoint_dir ./llama3-8b-ckpt \
    --output_dir ./llama3-8b-engine/ \
    --gemm_plugin float16 \
    --gpt_attention_plugin float16 \
    --paged_kv_cache enable \
    --remove_input_padding enable \
    --use_custom_all_reduce enable \
    --max_batch_size 32 \
    --max_input_len 2000 \
    --max_beam_width 1 \
    --tp_size 4 \
    --use_fused_mlp \
    --use_paged_context_fmha enable \
    --tokens_per_block 32 \
    --cluster_key A100-SXM-80GB

    Then copy llama3-8b-engine/* to the Triton model repository.

  2. convert target model to trt engine

    python convert_checkpoint.py \
    --model_dir /models/Meta-Llama-3-70B-Instruct/ \
    --output_dir ./llama3-70b-ckpt \
    --dtype float16 \
    --tp_size 4
    trtllm-build \
    --checkpoint_dir ./llama3-70b-ckpt \
    --output_dir ./llama3-70b-engine/ \
    --gemm_plugin float16 \
    --gpt_attention_plugin float16 \
    --paged_kv_cache enable \
    --remove_input_padding enable \
    --use_custom_all_reduce enable \
    --max_batch_size 32 \
    --max_input_len 2000 \
    --max_output_len 500 \
    --max_beam_width 1 \
    --tp_size 4 \
    --use_fused_mlp \
    --use_paged_context_fmha enable \
    --tokens_per_block 32 \
    --cluster_key A100-SXM-80GB \
    --max_draft_len 5 \
    --speculative_decoding_mode draft_tokens_external

    Then copy llama3-70b-engine/* to the Triton model repository.

  3. set up the Triton model repository. Each config.pbtxt is very long, so the input/output fields are omitted below.

3.1 preprocessing

    name: "preprocessing"
    backend: "python"
    max_batch_size: 32
    parameters {
    key: "tokenizer_dir"
    value: {
    string_value: "/models/Meta-Llama-3-70B-Instruct/"
    }
    }
    parameters {
    key: "add_special_tokens"
    value: {
    string_value: "true"
    }
    }
    instance_group [
    {
        count: 8
        kind: KIND_CPU
    }
    ]

3.2 postprocessing

name: "postprocessing"
backend: "python"
max_batch_size: 32
parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "/models/Meta-Llama-3-70B-Instruct/"
  }
}
parameters {
  key: "skip_special_tokens"
  value: {
    string_value: "True"
  }
}
instance_group [
    {
        count: 8
        kind: KIND_CPU
    }
]
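As a side note on how these parameters are consumed: in the Triton Python backend, `initialize()` receives the model config serialized as a JSON string, and the pre/postprocessing models look up the `string_value` entries from it. A minimal sketch (`read_string_param` is a hypothetical helper, not part of the backend API):

```python
import json

def read_string_param(model_config_json: str, key: str, default=None):
    """Look up a config.pbtxt `parameters` entry from the JSON-serialized
    model config that Triton passes to a Python model's initialize()."""
    cfg = json.loads(model_config_json)
    param = cfg.get("parameters", {}).get(key)
    return param["string_value"] if param else default

# Example mirroring the postprocessing config above:
sample = json.dumps({
    "name": "postprocessing",
    "parameters": {
        "tokenizer_dir": {"string_value": "/models/Meta-Llama-3-70B-Instruct/"},
        "skip_special_tokens": {"string_value": "True"},
    },
})
print(read_string_param(sample, "tokenizer_dir"))
```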

3.3 draft: similar to the tensorrt_llm model (https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/all_models/inflight_batcher_llm/tensorrt_llm)

name: "draft"
backend: "tensorrtllm"
max_batch_size: 32
model_transaction_policy {
  decoupled: False
}
dynamic_batching {
    preferred_batch_size: [ 32 ]
    max_queue_delay_microseconds: 10000
}

instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "1"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "inflight_fused_batching"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/v0.10/triton_repos/sps/draft/1"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "max_utilization"
  }
}
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.2"
  }
}
parameters: {
  key: "exclude_input_in_output"
  value: {
    string_value: "false"
  }
}
parameters: {
  key: "enable_kv_cache_reuse"
  value: {
    string_value: "true"
  }
}
parameters: {
  key: "enable_chunked_context"
  value: {
    string_value: "false"
  }
}
parameters: {
  key: "decoding_mode"
  value: {
    string_value: "top_k_top_p"
  }
}
parameters: {
  key: "worker_path"
  value: {
    string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
  }
}

3.4 target: similar to the draft model.

name: "target"
backend: "tensorrtllm"
max_batch_size: 32
model_transaction_policy {
  decoupled: False
}
dynamic_batching {
    preferred_batch_size: [ 32 ]
    max_queue_delay_microseconds: 10000
}

instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "1"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "inflight_fused_batching"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/v0.10/triton_repos/sps/target/1"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "max_utilization"
  }
}
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.5"
  }
}
parameters: {
  key: "exclude_input_in_output"
  value: {
    string_value: "false"
  }
}
parameters: {
  key: "enable_kv_cache_reuse"
  value: {
    string_value: "true"
  }
}
parameters: {
  key: "enable_chunked_context"
  value: {
    string_value: "false"
  }
}
parameters: {
  key: "decoding_mode"
  value: {
    string_value: "top_k_top_p"
  }
}
parameters: {
  key: "worker_path"
  value: {
    string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
  }
}
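One thing worth sanity-checking in the two configs above: the draft (0.2) and target (0.5) `kv_cache_free_gpu_mem_fraction` values are carved out of the same GPUs, since the fraction is applied to the memory still free once the engines are loaded. A rough capacity sketch (the 35 GB free figure is an assumption; the layer/head counts are Llama-3-70B's: 80 layers, 8 KV heads, head dim 128, fp16 KV, TP=4):

```python
def kv_cache_tokens_per_gpu(free_bytes, frac, n_layers, n_kv_heads,
                            head_dim, tp, dtype_bytes=2):
    """Approximate paged-KV-cache token capacity on one GPU: the budget is
    frac * free memory, divided by the per-token K+V footprint of the
    KV-head shard this GPU holds under tensor parallelism."""
    budget = free_bytes * frac
    per_token = 2 * n_layers * (n_kv_heads // tp) * head_dim * dtype_bytes
    return int(budget // per_token)

# Target model at frac=0.5, assuming ~35 GB free per A100-80G:
print(kv_cache_tokens_per_gpu(35 * 1024**3, 0.5, 80, 8, 128, 4))  # -> 229376
```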

3.5 ensemble: chains the preprocessing/target/postprocessing models together. The only change from the stock config.pbtxt is replacing the tensorrt_llm model with target.

3.6 sps: similar to tensorrt_llm_bls (https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/all_models/inflight_batcher_llm/tensorrt_llm_bls)

name: "sps"
backend: "python"
max_batch_size: 32
model_transaction_policy {
  decoupled: False
}

parameters: {
  key: "accumulate_tokens"
  value: {
    string_value: "false"
  }
}
parameters: {
  key: "tensorrt_llm_model_name"
  value: {
    string_value: "target"
  }
}
parameters: {
  key: "tensorrt_llm_draft_model_name"
  value: {
    string_value: "draft"
  }
}
instance_group [
  {
    count: 32
    kind : KIND_CPU
  }
]
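For reference, the control flow the BLS model runs is draft-propose / target-verify. A framework-agnostic toy sketch of that loop (greedy decoding with plain Python callables, not the actual Triton BLS code; all names are made up):

```python
def speculative_generate(target_next, draft_next, prompt, max_new, draft_len=5):
    """Toy external-draft speculative loop with greedy models: the draft
    proposes draft_len tokens, the target verifies them and keeps the
    longest matching prefix plus one "bonus" token."""
    seq = list(prompt)
    generated = 0
    while generated < max_new:
        # 1. draft model proposes draft_len tokens autoregressively
        ctx = list(seq)
        draft = []
        for _ in range(draft_len):
            t = draft_next(ctx)
            draft.append(t)
            ctx.append(t)
        # 2. target checks the draft; in the real backend this is a single
        #    forward pass over all draft positions, not a Python loop
        ctx = list(seq)
        accepted = []
        all_matched = True
        for d in draft:
            t = target_next(ctx)
            accepted.append(t)  # the target's own token is always valid output
            if t != d:
                all_matched = False
                break
            ctx.append(t)
        if all_matched:
            accepted.append(target_next(ctx))  # free token from the same pass
        seq.extend(accepted)
        generated += len(accepted)
    return seq[len(prompt):][:max_new]
```

In the real backend the target scores all draft positions in one forward pass, which is where any speedup comes from: each target step can emit up to `max_draft_len + 1` tokens instead of one.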
  4. launch the Triton server:

    cd /v0.10/tensorrtllm_backend
    python3 scripts/launch_triton_server.py \
    --world_size=4 \
    --tensorrt_llm_model_name target,draft \
    --model_repo=/v0.10/triton_repos/sps/

    The log shows all six models load successfully.

  5. test

    curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "Who is Issac Newton?", "max_tokens": 100, "bad_words": "", "stop_words": ""}'
    curl -X POST localhost:8000/v2/models/sps/generate -d '{"text_input": "Who is Issac Newton?", "max_tokens": 100, "bad_words": "", "stop_words": ""}'

output: {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"Who is Issac Newton? Isaac Newton was a renowned English mathematician, physicist, astronomer, and philosopher who is widely recognized as one of the most influential scientists in history. He made major contributions to the fields of mathematics, optics, and physics, and his work laid the foundation for the Scientific Revolution of the 17th century.\n\nNewton was born on January 4, 1643, in Woolsthorpe, Lincolnshire, England, to a family of modest means. His father, a yeoman farmer,"}

So the sps pipeline works end to end.

  6. test performance
    
from transformers import AutoTokenizer
import requests
import json, time, os
import threading

Concurrency = 8
Count = 100
OutputLen = 150

model_id = "/models/Meta-Llama-3-70B-Instruct/"
triton_base = "http://127.0.0.1:8000"
model_name = "ensemble"
model_name = "sps"  # switch between "ensemble" and "sps"
url = '{}/v2/models/{}/generate'.format(triton_base, model_name)
tokenizer = AutoTokenizer.from_pretrained(model_id)

demo_prompt = """ Evaluate the following articles from various aspects such as writing, style, etc.\n\nPRIDE & PREJUDICE.CHAPTER I.It is a truth universally acknowledged, that a single man in possessionof a good fortune, must be in want of a wife.However little known the feelings or views of such a man may be on hisfirst entering a neighbourhood, this truth is so well fixed in the mindsof the surrounding families, that he is considered as the rightfulproperty of some one or other of their daughters.My dear Mr. Bennet, said his lady to him one day, have you heard thatNetherfield Park is let at last?Mr. Bennet replied that he had not.But it is, returned she; for Mrs. Long has just been here, and shetold me all about it.Mr. Bennet made no answer.Do not you want to know who has taken it? cried his wife impatiently.You want to tell me, and I have no objection to hearing it.This was invitation enough.Why, my dear, you must know, Mrs. Long says that Netherfield is takenby a young man of large fortune from the north of England; that he camedown on Monday in a chaise and four to see the place, and was so muchdelighted with it that he agreed with Mr. Morris immediately; that he isto take possession before Michaelmas, and some of his servants are to bein the house by the end of next week.What is his name?Bingley.Is he married or single?Oh! single, my dear, to be sure! A single man of large fortune; fouror five thousand a year. What a fine thing for our girls!How so? how can it affect them?My dear Mr. Bennet, replied his wife, how can you be so tiresome! Youmust know that I am thinking of his marrying one of them.Is that his design in settling here?Design! nonsense, how can you talk so! But it is very likely that hemay fall in love with one of them, and therefore you must visit him assoon as he comes.I see no occasion for that. You and the girls may go, or you may sendthem by themselves, which perhaps will be still better, for as you areas handsome as any of them, Mr. 
Bingley might like you the best of theparty.My dear, you flatter me. I certainly have had my share of beauty, butI do not pretend to be any thing extraordinary now. When a woman hasfive grown up daughters, she ought to give over thinking of her ownbeauty.In such cases, a woman has not often much beauty to think of.But, my dear, you must indeed go and see Mr. Bingley when he comes intothe neighbourhood.It is more than I engage for, I assure you.But consider your daughters. Only think what an establishment it wouldbe for one of them. Sir William and Lady Lucas are determined to go,merely on that account, for in general you know they visit no newcomers. Indeed you must go, for it will be impossible for us to visithim, if you do not.You are over scrupulous surely. I dare say Mr. Bingley will be veryglad to see you; and I will send a few lines by you to assure him of myhearty consent to his marrying which ever he chuses of the girls; thoughI must throw in a good word for my little Lizzy.I desire you will do no such thing. Lizzy is not a bit better than theothers; and I am sure she is not half so handsome as Jane, nor half sogood humoured as Lydia. But you are always giving her thepreference.They have none of them much to recommend them, replied he; they areall silly and ignorant like other girls; but Lizzy has something more ofquickness than her sisters.Mr. Bennet, how can you abuse your own children in such a way? You takedelight in vexing me. You have no compassion on my poor nerves.You mistake me, my dear. I have a high respect for your nerves. Theyare my old friends. I have heard you mention them with considerationthese twenty years at least.Ah! you do not know what I suffer.But I hope you will get over it, and live to see many young men of fourthousand a year come into the neighbourhood.It will be no use to us, if twenty such should come since you will notvisit them.Depend upon it, my dear, that when there are twenty, I will visit themall.Mr. 
Bennet was so odd a mixture of quick parts, sarcastic humour,reserve, and caprice, that the experience of three and twenty years hadbeen insufficient to make his wife understand his character. Her mindwas less difficult to develope. She was a woman of mean understanding,little information, and uncertain temper. When she was discontented shefancied herself nervous. The business of her life was to get herdaughters married; its solace was visiting and news.CHAPTER II.Mr. Bennet was among the earliest of those who waited on Mr. Bingley. Hehad always intended to visit him, though to the last always assuring hiswife that he should not go; and till the evening after the visit waspaid, she had no knowledge of it. It was then disclosed in the followingmanner. Observing his second daughter employed in trimming a hat, hesuddenly addressed her with,I hope Mr. Bingley will like it Lizzy.We are not in a way to know what Mr. Bingley likes, said her motherresentfully, since we are not to visit.But you forget, mama, said Elizabeth, that we shall meet him at theassemblies, and that Mrs. Long has promised to introduce him.I do not believe Mrs. Long will do any such thing. She has two niecesof her own. She is a selfish, hypocritical woman, and I have no opinionof her.No more have I, said Mr. Bennet; and I am glad to find that you donot depend on her serving you.Mrs. Bennet deigned not to make any reply; but unable to containherself, began scolding one of her daughters.Don't keep coughing so, Kitty, for heaven's sake! Have a littlecompassion on my nerves. You tear them to pieces.Kitty has no discretion in her coughs, said her father; she timesthem ill.I do not cough for my own amusement, replied Kitty fretfully.When is your next ball to be, Lizzy?To-morrow fortnight.Aye, so it is, cried her mother, and Mrs. 
Long does not come backtill the day before; so, it will be impossible for her to introduce him,for she will not know him herself.Then, my dear, you may have the advantage of your friend, and introduceMr. Bingley to her.Impossible, Mr. Bennet, impossible, when I am not acquainted with himmyself; how can you be so teazing?I honour your circumspection. A fortnight's acquaintance is certainlyvery little. One cannot know what a man really is by the end of afortnight. But if we do not venture, somebody else will; and afterall, Mrs. Long and her nieces must stand their chance; and therefore, asshe will think it an act of kindness, if you decline the office, I willtake it on myself.The girls stared at their father. Mrs. Bennet said only, Nonsense,nonsense!What can be the meaning of that emphatic exclamation? cried he. """

input_id_len = len(tokenizer.encode(demo_prompt))

lock = threading.Lock()

class StressStat:
    def __init__(self):
        self.total_cnt = 0
        self.succ_cnt = 0
        self.fail_cnt = 0
        self.prompt_tokens = 0
        self.generate_tokens = 0
        self.rt_list = []

    def record_once(self, succ: bool, prompt_len: int, generate_len: int, rt: float):
        with lock:
            self.total_cnt += 1
            if not succ:
                self.fail_cnt += 1
            else:
                self.succ_cnt += 1
                self.prompt_tokens += prompt_len
                self.generate_tokens += generate_len
                self.rt_list.append(rt)

    def show_stat(self, total_time: float):
        print("total time: %.2f" % total_time)
        print(f"total cnt: {self.total_cnt}, succ cnt: {self.succ_cnt}, fail cnt: {self.fail_cnt}")
        rps = self.succ_cnt / total_time
        print("rps: %.2f" % rps)
        input_tps = self.prompt_tokens / total_time
        print("input tps: %.1f" % input_tps)
        output_tps = self.generate_tokens / total_time
        print("output tps: %.1f" % output_tps)
        self.rt_list.sort()
        print("p50: %.2f" % self.rt_list[int(len(self.rt_list) * 0.5)])
        print("p99: %.2f" % self.rt_list[int(len(self.rt_list) * 0.99)])

stress_stat = StressStat()

def build_req() -> dict:
    req = {
        "text_input": demo_prompt,
        "max_tokens": OutputLen,
    }
    return req

def infer_once(req: dict):
    start_ts = time.time()
    resp: requests.Response = requests.post(url, json=req)
    elapse = time.time() - start_ts
    if resp.status_code != 200:
        return False, 0, 0
    resp_obj = json.loads(resp.content)
    # text_output echoes the prompt (exclude_input_in_output=false),
    # so subtract the prompt length to get the generated-token count
    output_len = len(tokenizer.encode(resp_obj["text_output"]))
    output_len -= input_id_len
    return True, output_len, elapse

def worker_task(thread_id):
    print("thread {} start...".format(thread_id))
    req = build_req()
    for i in range(Count):
        succ, output_len, elapse = infer_once(req=req)
        stress_stat.record_once(succ=succ, prompt_len=input_id_len, generate_len=output_len, rt=elapse)
        if i % (Count // 10) == 0:
            print(i)
    print("thread {} done".format(thread_id))

def main():
    threads = []
    num_workers = Concurrency
    start_ts = time.time()
    for i in range(num_workers):
        thread = threading.Thread(target=worker_task, args=(i,))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    total_time = time.time() - start_ts

    print('---all workers done---')
    stress_stat.show_stat(total_time=total_time)

if __name__ == "__main__":
    main()



After running the script, I found that the ensemble and sps models show the same performance:
rps: 1.51
input tps: 2414.4
output tps: 221.7
p50: 5.30
p99: 5.33

I'm not sure whether this is expected or whether there are mistakes in my setup.
Please help me check the perf issue, many thanks!
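For what it's worth, the best case for external draft tokens is bounded by the acceptance rate. If each of the gamma draft tokens is accepted with probability alpha (something I would have to measure for this model pair), the expected tokens per target verification step under the usual speculative-sampling analysis is (1 - alpha^(gamma + 1)) / (1 - alpha); a quick sketch:

```python
def expected_tokens_per_step(alpha, gamma):
    """Expected target tokens emitted per verification step when each of
    the gamma draft tokens is accepted i.i.d. with probability alpha."""
    return (1 - alpha ** (gamma + 1)) / (1 - alpha)

for alpha in (0.6, 0.8, 0.9):
    print(f"alpha={alpha}: {expected_tokens_per_step(alpha, 5):.2f} tokens/step")
```

Even at a good alpha this is only an upper bound on per-request speedup; at batch size 32 the GPUs may already be throughput-bound, and the 8B draft shares the same 4 GPUs, which could also eat into any end-to-end gain.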
Alienfeel commented 1 week ago

@kaiyux need help, many thanks