huggingface / transformers


LLaMa 3 8B - offloaded_static cache - layer_device_map TypeError #34327

Open · mikethebos opened this issue 1 day ago

mikethebos commented 1 day ago

System Info

- Transformers: 4.45.2 (patch release)
- PyTorch: 1.10.1
- Python: 3.8.0
- CUDA: 11.1
- GPU: NVIDIA V100

Who can help?

@gante @zucchini-nlp @Rocketknight1

Reproduction

Stack trace:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 outputs = pipe(
      2     messages,
      3     max_new_tokens=3000,
      4     eos_token_id=terminators,
      5     do_sample=True,
      6     temperature=0.6,
      7     top_p=0.9,
      8     # cache_implementation="static",
      9     cache_implementation="offloaded_static",
     10 )
     11 assistant_response = outputs[0]["generated_text"][-1]["content"]
     12 print(assistant_response)

File python3.8/site-packages/transformers/pipelines/text_generation.py:267, in TextGenerationPipeline.__call__(self, text_inputs, **kwargs)
    262 if isinstance(
    263     text_inputs, (list, tuple, KeyDataset) if is_torch_available() else (list, tuple)
    264 ) and isinstance(text_inputs[0], (list, tuple, dict)):
    265     # We have one or more prompts in list-of-dicts format, so this is chat mode
    266     if isinstance(text_inputs[0], dict):
--> 267         return super().__call__(Chat(text_inputs), **kwargs)
    268     else:
    269         chats = [Chat(chat) for chat in text_inputs]  # 🐈 🐈 🐈

File python3.8/site-packages/transformers/pipelines/base.py:1268, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
   1260     return next(
   1261         iter(
   1262             self.get_iterator(
   (...)
   1265         )
   1266     )
   1267 else:
-> 1268     return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File python3.8/site-packages/transformers/pipelines/base.py:1275, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
   1273 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
   1274     model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1275     model_outputs = self.forward(model_inputs, **forward_params)
   1276     outputs = self.postprocess(model_outputs, **postprocess_params)
   1277     return outputs

File python3.8/site-packages/transformers/pipelines/base.py:1175, in Pipeline.forward(self, model_inputs, **forward_params)
   1173     with inference_context():
   1174         model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1175         model_outputs = self._forward(model_inputs, **forward_params)
   1176         model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
   1177 else:

File python3.8/site-packages/transformers/pipelines/text_generation.py:370, in TextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
    367 if "generation_config" not in generate_kwargs:
    368     generate_kwargs["generation_config"] = self.generation_config
--> 370 generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
    371 out_b = generated_sequence.shape[0]
    372 if self.framework == "pt":

File python3.8/site-packages/torch/autograd/grad_mode.py:28, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     25 @functools.wraps(func)
     26 def decorate_context(*args, **kwargs):
     27     with self.__class__():
---> 28         return func(*args, **kwargs)

File python3.8/site-packages/transformers/generation/utils.py:1921, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1915 if (
   1916     inputs_tensor.shape[1] != input_ids_length
   1917     and model_input_name == "inputs_embeds"
   1918     and not self.config.is_encoder_decoder
   1919 ):
   1920     max_cache_length += inputs_tensor.shape[1]
-> 1921 self._prepare_cache_for_generation(
   1922     generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
   1923 )
   1925 # 8. determine generation mode
   1926 generation_mode = generation_config.get_generation_mode(assistant_model)

File python3.8/site-packages/transformers/generation/utils.py:1566, in GenerationMixin._prepare_cache_for_generation(self, generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device)
   1561     if generation_config.cache_implementation == "static" and not self._supports_static_cache:
   1562         raise ValueError(
   1563             "This model does not support `cache_implementation='static'`. Please check the following "
   1564             "issue: https://github.com/huggingface/transformers/issues/28981"
   1565         )
-> 1566     model_kwargs[cache_name] = self._get_cache(
   1567         cache_implementation=generation_config.cache_implementation,
   1568         batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
   1569         max_cache_len=max_cache_length,
   1570         device=device,
   1571         model_kwargs=model_kwargs,
   1572     )
   1573 elif generation_config.cache_implementation == "quantized":
   1574     if not self._supports_quantized_cache:

File python3.8/site-packages/transformers/generation/utils.py:1476, in GenerationMixin._get_cache(self, cache_implementation, batch_size, max_cache_len, device, model_kwargs)
   1466 layer_device_map = get_layer_device_map(execution_device_map)
   1468 cache_kwargs = {
   1469     "config": self.config.get_text_config(),
   1470     "max_batch_size": batch_size,
   (...)
   1474     "layer_device_map": layer_device_map,
   1475 }
-> 1476 self._cache = cache_cls(**cache_kwargs)
   1477 if requires_cross_attention_cache:
   1478     encoder_kwargs = cache_kwargs.copy()

TypeError: __init__() got an unexpected keyword argument 'layer_device_map'

Code:

from transformers import pipeline
import torch

cuda_dev_id = 2

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.float16},  # bfloat16 breaks on torch 1.10.1
    device="cuda:" + str(cuda_dev_id)
)

role = """
You are an AI assistant REDACTED.
"""

prompt = """Here is the id: """ + "\n" + str(example_id) + "\n\n" + """Here is the cid: """ + "\n" + example_cid + "\n\n" + """Here is the s: """ + "\n"+ example_s + "\n\n" + """Here is the c: """ + "\n" + example_c

messages = [
    {"role": "system", "content": role},
    {"role": "user", "content": prompt},
]

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipe(
    messages,
    max_new_tokens=3000,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    cache_implementation="offloaded_static",
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)

Expected behavior

assistant_response should be a generated response from the LLaMa model.
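
The same TypeError can apparently be triggered without the pipeline by constructing the cache directly. Below is a minimal sketch, assuming OffloadedStaticCache is importable from transformers.cache_utils and that the keyword names mirror the cache_kwargs built in GenerationMixin._get_cache in v4.45.2 (see the traceback above); the concrete values are placeholders:

# Minimal sketch, assuming transformers v4.45.2. The keywords below mirror the
# cache_kwargs built in GenerationMixin._get_cache; layer_device_map should be
# the one the constructor rejects.
import torch
from transformers import AutoConfig
from transformers.cache_utils import OffloadedStaticCache

config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# Expected to raise:
#   TypeError: __init__() got an unexpected keyword argument 'layer_device_map'
cache = OffloadedStaticCache(
    config=config,
    max_batch_size=1,
    max_cache_len=4096,
    device="cuda:2",
    dtype=torch.float16,
    layer_device_map={i: "cuda:2" for i in range(config.num_hidden_layers)},
)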

Rocketknight1 commented 22 hours ago

This looks like it's happening in generate(), so cc @gante! Let me know if you think it's a pipeline issue instead and I'll handle it.

chengchengpei commented 21 hours ago

@Rocketknight1 I suspect layer_device_map support is missing from the offloaded_static cache. I have a WIP PR: https://github.com/huggingface/transformers/pull/34330/files

Can you review and comment?
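
Until a fix is merged, one possible stopgap is to build the offloaded static cache by hand, without layer_device_map, and pass it instead of cache_implementation. This is a hedged sketch, not a verified recommendation: it reuses pipe, messages, and terminators from the reproduction above, assumes the constructor keywords match v4.45.2, and assumes the text-generation pipeline forwards past_key_values on to model.generate() the same way it forwards the other generation kwargs:

# Workaround sketch (assumptions noted above). Constructing the cache manually
# bypasses GenerationMixin._get_cache, the code path that always injects
# layer_device_map.
import torch
from transformers.cache_utils import OffloadedStaticCache

max_new_tokens = 3000
prompt_budget = 1024  # hypothetical upper bound on the tokenized prompt length

cache = OffloadedStaticCache(
    config=pipe.model.config,
    max_batch_size=1,
    max_cache_len=prompt_budget + max_new_tokens,
    device=pipe.model.device,
    dtype=torch.float16,
)

outputs = pipe(
    messages,
    max_new_tokens=max_new_tokens,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    past_key_values=cache,  # replaces cache_implementation="offloaded_static"
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)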