huggingface / optimum-nvidia


Batching seems to be broken. #114

Closed. zaycev closed this issue 5 months ago.

zaycev commented 5 months ago

import logging

from transformers import AutoTokenizer
from optimum.nvidia import AutoModelForCausalLM

MAX_LEN = 8_000
MODEL_PATH = "/workspace/models/dolphin-2.6-mistral-7b"

logging.info(f"Loading model: {MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    use_fp8=True,
    max_prompt_length=8_000,
    max_output_length=12_000,
    max_batch_size=32,
)

...

def create_completions(prompts, max_tokens=None, temperature=None):
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
    ids = model.generate(**inputs,
                         max_new_tokens=max_tokens,
                         do_sample=True,
                         temperature=temperature,
                         num_beams=5,
                         renormalize_logits=True,
                         pad_token_id=tokenizer.eos_token_id)
    return tokenizer.batch_decode(ids[0])

print(create_completions(["text 1", "text 2"], temperature=0.4, max_tokens=128))

This throws the following:

RuntimeError                              Traceback (most recent call last)
Cell In[41], line 11
      8     break
     10 start_time = time.time()
---> 11 print(create_completions(["text 1", "text 2"], temperature=0.4, max_tokens=128))
     12 end_time = time.time()

Cell In[37], line 12
     11 inputs = tokenizer(prompts, padding=True, return_tensors="pt").to("cuda")
---> 12 ids = model.generate(**inputs,
     13                      max_new_tokens=max_tokens,
     14                      do_sample=True,
     15                      temperature=temperature,
     16                      num_beams=5,
     17                      renormalize_logits=True,
     18                      pad_token_id=tokenizer.eos_token_id
     19 
     20 )
     21 return tokenizer.batch_decode(ids[0])

File /workspace/ftt-notebooks/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /workspace/ftt-notebooks/venv/lib/python3.10/site-packages/optimum/nvidia/runtime.py:282, in CausalLM.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
    279 output_ids = trt_outputs.ids.flatten(0, 1)
    281 # For some reason not in line with Transformers in case we finish early with BOS token (missing last BOS token).
--> 282 if total_length - input_length < max_new_tokens:
    283     total_length += 1
    285 return output_ids[:, :total_length], total_length

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

If I print what is being compared at runtime.py:282:

            print("total_length", total_length)
            print("input_length", input_length)
            print("max_new_tokens", max_new_tokens)
            if total_length - input_length < max_new_tokens:
                total_length += 1

Here is what I get:

total_length tensor([[191],
        [192]], device='cuda:0', dtype=torch.int32)
input_length 276
max_new_tokens 128
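
So with a batch of more than one sequence, total_length is a (batch, 1) tensor, the subtraction in runtime.py:282 yields a multi-element boolean tensor, and Python's if cannot reduce it to a single truth value. A minimal repro with exactly these values, outside of optimum-nvidia:

import torch

# With batch size > 1, total_length is a (batch, 1) int tensor, so the
# comparison produces a boolean tensor with more than one element and the
# `if` statement cannot convert it to a single bool.
total_length = torch.tensor([[191], [192]], dtype=torch.int32)
input_length = 276
max_new_tokens = 128

try:
    if total_length - input_length < max_new_tokens:
        total_length += 1
except RuntimeError as e:
    print(e)  # Boolean value of Tensor with more than one value is ambiguous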

I suppose it should be the max of the per-sequence lengths? Something like total_length = trt_outputs.lengths.max().item()?
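
For reference, here is a minimal sketch of what that change could look like, written as a standalone helper rather than the actual optimum-nvidia code (the name clip_total_length and the lengths parameter are mine, and I am assuming trt_outputs.lengths holds the per-sequence output lengths):

import torch

def clip_total_length(lengths: torch.Tensor, input_length: int, max_new_tokens: int) -> int:
    # Hypothetical helper mirroring the suggestion above: collapse the
    # per-sequence lengths to a single Python int before the comparison,
    # so the early-finish adjustment also works for batched generation.
    total_length = lengths.max().item()
    if total_length - input_length < max_new_tokens:
        total_length += 1
    return total_length

# With the tensor printed above this returns 193, a plain int that can be
# used to slice output_ids along the sequence dimension.
print(clip_total_length(torch.tensor([[191], [192]], dtype=torch.int32), 276, 128))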

fxmarty commented 5 months ago

Thanks a lot for the fix!