ELS-RD / transformer-deploy

Efficient, scalable and enterprise-grade CPU/GPU inference server for 🤗 Hugging Face transformer models 🚀
https://els-rd.github.io/transformer-deploy/
Apache License 2.0

TypeError: unhashable type: 'slice' #136

Closed: pngmafia closed this issue 1 year ago

pngmafia commented 2 years ago
TypeError                                 Traceback (most recent call last)
Cell In [16], line 13
     11 gpt2_model = GPTModelWrapper(config=model.config, device=torch.device("cuda"), inference=inference_tensorrt)
     12 inputs.to("cuda")
---> 13 sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
     14 print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
     15 for _ in range(2):

File /usr/local/lib/python3.8/dist-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     24 @functools.wraps(func)
     25 def decorate_context(*args, **kwargs):
     26     with self.clone():
---> 27         return func(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers/generation_utils.py:1294, in GenerationMixin.generate(self, inputs, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, typical_p, repetition_penalty, bad_words_ids, force_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, logits_processor, renormalize_logits, stopping_criteria, constraints, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, exponential_decay_length_penalty, **model_kwargs)
   1289         raise ValueError(
   1290             f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search."
   1291         )
   1293     # 10. run greedy search
-> 1294     return self.greedy_search(
   1295         input_ids,
   1296         logits_processor=logits_processor,
   1297         stopping_criteria=stopping_criteria,
   1298         pad_token_id=pad_token_id,
   1299         eos_token_id=eos_token_id,
   1300         output_scores=output_scores,
   1301         return_dict_in_generate=return_dict_in_generate,
   1302         synced_gpus=synced_gpus,
   1303         **model_kwargs,
   1304     )
   1306 elif is_sample_gen_mode:
   1307     # 10. prepare logits warper
   1308     logits_warper = self._get_logits_warper(
   1309         top_k=top_k,
   1310         top_p=top_p,
   (...)
   1314         renormalize_logits=renormalize_logits,
   1315     )

File /usr/local/lib/python3.8/dist-packages/transformers/generation_utils.py:1699, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
   1696     cur_len = cur_len + 1
   1697     continue  # don't waste resources running the code we don't need
-> 1699 next_token_logits = outputs.logits[:, -1, :]
   1701 # pre-process distribution
   1702 next_tokens_scores = logits_processor(input_ids, next_token_logits)

TypeError: unhashable type: 'slice'

Any clue how I could fix this?
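
For context: this particular TypeError means the object being sliced is a plain dict rather than a tensor. Indexing a dict with [:, -1, :] builds a tuple of slice objects as the lookup key, and slices are unhashable, so Python raises exactly this message. A minimal reproduction, independent of the notebook:

d = {"logits": None}
d[:, -1, :]  # TypeError: unhashable type: 'slice'

So outputs.logits at generation_utils.py line 1699 is most likely a dict returned by the inference callable instead of the expected logits tensor.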

pommedeterresautee commented 2 years ago

Can you provide a way to reproduce the exception?

pngmafia commented 2 years ago

I just re-ran the cells in the GPT-2 notebook, and the issue occurs while running the TensorRT inference cell:

import time
from typing import Callable, Dict

import torch

# Load the serialized TensorRT engine and wrap it as a callable.
tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(
    engine_file_path="test-gpt2.plan", runtime=runtime
)

def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:
    data = {"input_ids": input_ids}
    return tensorrt_model(data)

gpt2_model = GPTModelWrapper(config=model.config, device=torch.device("cuda"), inference=inference_tensorrt)
inputs.to("cuda")
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

# Warm-up runs before timing.
for _ in range(2):
    _ = gpt2_model.generate(inputs.input_ids, max_length=64)

# Timed runs.
start = time.time()
for _ in range(10):
    _ = gpt2_model.generate(inputs.input_ids, max_length=256)
print(f"----\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence")

del tensorrt_model
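
If load_engine hands back a dict of named output tensors rather than a single tensor, outputs.logits inside generate would be that dict, which matches the traceback above. A minimal workaround sketch under that assumption: unwrap the logits tensor in the wrapper before returning it. The binding name "output" below is hypothetical and should be checked against the engine's actual output names:

def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:
    data = {"input_ids": input_ids}
    result = tensorrt_model(data)
    # Assumption: the engine may return a dict of named outputs; unwrap the
    # logits tensor. "output" is a hypothetical binding name -- verify it
    # against the engine's actual outputs.
    if isinstance(result, dict):
        return result["output"]
    return result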