```python
import logging

from transformers import AutoTokenizer
from optimum.nvidia import AutoModelForCausalLM

MAX_LEN = 8_000
MODEL_PATH = "/workspace/models/dolphin-2.6-mistral-7b"

logging.info(f"Loading model: {MODEL_PATH}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    use_fp8=True,
    max_prompt_length=8_000,
    max_output_length=12_000,
    max_batch_size=32,
)

...

def create_completions(prompts, max_tokens=None, temperature=None):
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
    ids = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        num_beams=5,
        renormalize_logits=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.batch_decode(ids[0])

print(create_completions(["text 1", "text 2"], temperature=0.4, max_tokens=128))
```
Throws the following:
```
RuntimeError                              Traceback (most recent call last)
Cell In[41], line 11
      8     break
     10 start_time = time.time()
---> 11 print(create_completions(["text 1", "text 2"], temperature=0.4, max_tokens=128))
     12 end_time = time.time()

Cell In[37], line 12
     11 inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
---> 12 ids = model.generate(**inputs,
     13     max_new_tokens=max_tokens,
     14     do_sample=True,
     15     temperature=temperature,
     16     num_beams=5,
     17     renormalize_logits=True,
     18     pad_token_id=tokenizer.eos_token_id
     19
     20 )
     21 return tokenizer.batch_decode(ids[0])

File /workspace/ftt-notebooks/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /workspace/ftt-notebooks/venv/lib/python3.10/site-packages/optimum/nvidia/runtime.py:282, in CausalLM.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
    279 output_ids = trt_outputs.ids.flatten(0, 1)
    281 # For some reason not in line with Transformers in case we finish early with BOS token (missing last BOS token).
--> 282 if total_length - input_length < max_new_tokens:
    283     total_length += 1
    285 return output_ids[:, :total_length], total_length

RuntimeError: Boolean value of Tensor with more than one value is ambiguous
```
If I print what is being compared in `runtime.py:282`:
print("total_length", total_length) print("input_length", input_length) print("max_new_tokens", max_new_tokens) if total_length - input_length < max_new_tokens: total_length += 1
Here is what I get:
```
total_length tensor([[191],
                     [192]], device='cuda:0', dtype=torch.int32)
input_length 276
max_new_tokens 128
```
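For reference, here is a minimal standalone repro of the failure, with the values hardcoded from the debug output above (not pulled from the library itself). It just shows why putting that comparison in an `if` raises this particular `RuntimeError`:

```python
import torch

# Values copied from the debug prints above: total_length is a (batch, 1)
# int32 tensor, while input_length and max_new_tokens are plain Python ints.
total_length = torch.tensor([[191], [192]], dtype=torch.int32)
input_length = 276
max_new_tokens = 128

# The comparison broadcasts, producing a (2, 1) boolean tensor rather than a bool.
condition = total_length - input_length < max_new_tokens
print(condition)  # tensor([[True], [True]])

try:
    if condition:  # `if` needs a single bool, so this raises
        total_length += 1
except RuntimeError as e:
    print(e)  # Boolean value of Tensor with more than one value is ambiguous
```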
I suppose it should be the max of the lengths? Something like `total_length = trt_outputs.lengths.max().item()`?
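To illustrate, here is a rough sketch of how the block around `runtime.py:282` could look with that change. The surrounding lines are reconstructed from the traceback above, and it assumes `trt_outputs.lengths` holds the per-sequence generated lengths; it is an excerpt from the method body, not a tested patch:

```python
# Excerpt from CausalLM.generate in optimum/nvidia/runtime.py (reconstructed).
output_ids = trt_outputs.ids.flatten(0, 1)

# Collapse the per-sequence lengths to a single Python int so the comparison
# below yields a plain bool instead of a multi-element boolean tensor.
total_length = trt_outputs.lengths.max().item()

if total_length - input_length < max_new_tokens:
    total_length += 1

return output_ids[:, :total_length], total_length
```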
Thanks a lot for the fix!