Clm Prompt tuning inference based on tutorial throwing runtime error #1390

Closed LaviRoars closed 7 months ago

LaviRoars commented 8 months ago

System Info

accelerate==0.26.1, transformers==4.37.0, peft==0.7.1, torch==2.1.2, sagemaker notebook

Who can help?

Hi, I was running the prompt tuning tutorial from Hugging Face shown in this link, and when it got to the inference part, model.generate() threw a runtime error about mismatched tensor shapes, as shown in the traceback below. I basically just copied and pasted the code from the tutorial with no modifications; the package versions are listed above.

RuntimeError Traceback (most recent call last) Cell In[23], line 5 3 with torch.no_grad(): 4 inputs = {k: for k, v in inputs.items()} ----> 5 outputs = model.generate( 6 input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3 7 ) 8 print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) 9 [ 10 "Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint" 11 ]

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/peft/, in PeftModelForCausalLM.generate(self, kwargs) 1128 self.base_model.generation_config = self.generation_config 1129 try: -> 1130 outputs = self.base_model.generate(kwargs) 1131 except: 1132 self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/utils/, in context_decorator..decorate_context(*args, kwargs) 112 @functools.wraps(func) 113 def decorate_context(*args, *kwargs): 114 with ctx_factory(): --> 115 return func(args, kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, kwargs) 1457 return self.assisted_decoding( 1458 input_ids, 1459 candidate_generator=candidate_generator, (...) 1470 model_kwargs, 1471 ) 1472 if generation_mode == GenerationMode.GREEDY_SEARCH: 1473 # 11. run greedy search -> 1474 return self.greedy_search( 1475 input_ids, 1476 logits_processor=prepared_logits_processor, 1477 stopping_criteria=prepared_stopping_criteria, 1478 pad_token_id=generation_config.pad_token_id, 1479 eos_token_id=generation_config.eos_token_id, 1480 output_scores=generation_config.output_scores, 1481 return_dict_in_generate=generation_config.return_dict_in_generate, 1482 synced_gpus=synced_gpus, 1483 streamer=streamer, 1484 **model_kwargs, 1485 ) 1487 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: 1488 if not model_kwargs["use_cache"]:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, model_kwargs) 2332 model_inputs = self.prepare_inputs_for_generation(input_ids, model_kwargs) 2334 # forward pass to get next token -> 2335 outputs = self( 2336 **model_inputs, 2337 return_dict=True, 2338 output_attentions=output_attentions, 2339 output_hidden_states=output_hidden_states, 2340 ) 2342 if synced_gpus and this_peer_finished: 2343 continue # don't waste resources running the code we don't need

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._wrapped_call_impl(self, *args, kwargs) 1516 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1517 else: -> 1518 return self._call_impl(args, kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, *kwargs) 1522 # If we don't have any hooks, we want to skip the rest of the logic in 1523 # this function, and just call forward. 1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1525 or _global_backward_pre_hooks or _global_backward_hooks 1526 or _global_forward_hooks or _global_forward_pre_hooks): -> 1527 return forward_call(args, **kwargs) 1529 try: 1530 result = None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/, in BloomForCausalLM.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments) 854 raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") 856 return_dict = return_dict if return_dict is not None else self.config.use_return_dict --> 858 transformer_outputs = self.transformer( 859 input_ids, 860 past_key_values=past_key_values, 861 attention_mask=attention_mask, 862 head_mask=head_mask, 863 inputs_embeds=inputs_embeds, 864 use_cache=use_cache, 865 output_attentions=output_attentions, 866 output_hidden_states=output_hidden_states, 867 return_dict=return_dict, 868 ) 869 hidden_states = transformer_outputs[0] 871 lm_logits = self.lm_head(hidden_states)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._wrapped_call_impl(self, *args, kwargs) 1516 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1517 else: -> 1518 return self._call_impl(args, kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, *kwargs) 1522 # If we don't have any hooks, we want to skip the rest of the logic in 1523 # this function, and just call forward. 1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1525 or _global_backward_pre_hooks or _global_backward_hooks 1526 or _global_forward_hooks or _global_forward_pre_hooks): -> 1527 return forward_call(args, **kwargs) 1529 try: 1530 result = None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/, in BloomModel.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments) 711 outputs = self._gradient_checkpointing_func( 712, 713 hidden_states, (...) 719 output_attentions, 720 ) 721 else: --> 722 outputs = block( 723 hidden_states, 724 layer_past=layer_past, 725 attention_mask=causal_mask, 726 head_mask=head_mask[i], 727 use_cache=use_cache, 728 output_attentions=output_attentions, 729 alibi=alibi, 730 ) 732 hidden_states = outputs[0] 733 if use_cache is True:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._wrapped_call_impl(self, *args, kwargs) 1516 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1517 else: -> 1518 return self._call_impl(args, kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, *kwargs) 1522 # If we don't have any hooks, we want to skip the rest of the logic in 1523 # this function, and just call forward. 1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1525 or _global_backward_pre_hooks or _global_backward_hooks 1526 or _global_forward_hooks or _global_forward_pre_hooks): -> 1527 return forward_call(args, **kwargs) 1529 try: 1530 result = None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/, in BloomBlock.forward(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions) 407 residual = hidden_states 409 # Self attention. --> 410 attn_outputs = self.self_attention( 411 layernorm_output, 412 residual, 413 layer_past=layer_past, 414 attention_mask=attention_mask, 415 alibi=alibi, 416 head_mask=head_mask, 417 use_cache=use_cache, 418 output_attentions=output_attentions, 419 ) 421 attention_output = attn_outputs[0] 423 outputs = attn_outputs[1:]

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._wrapped_call_impl(self, *args, kwargs) 1516 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1517 else: -> 1518 return self._call_impl(args, kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/, in Module._call_impl(self, *args, *kwargs) 1522 # If we don't have any hooks, we want to skip the rest of the logic in 1523 # this function, and just call forward. 1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1525 or _global_backward_pre_hooks or _global_backward_hooks 1526 or _global_forward_hooks or _global_forward_pre_hooks): -> 1527 return forward_call(args, **kwargs) 1529 try: 1530 result = None

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/, in BloomAttention.forward(self, hidden_states, residual, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions) 284 present = None 286 # [batch_size * num_heads, q_length, kv_length] 287 # we use torch.Tensor.baddbmm instead of torch.baddbmm as the latter isn't supported by TorchScript v1.11 --> 288 matmul_result = alibi.baddbmm( 289 batch1=query_layer, 290 batch2=key_layer, 291 beta=self.beta, 292 alpha=self.inv_norm_factor, 293 ) 295 # change view to [batch_size, num_heads, q_length, kv_length] 296 attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length)

RuntimeError: The expanded size of the tensor (38) must match the existing size (94) at non-singleton dimension 2. Target sizes: [16, 1, 38]. Tensor sizes: [16, 1, 94]




with torch.no_grad(): inputs = {k: for k, v in inputs.items()} outputs = model.generate( input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3 ) print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

Expected behavior

It should produce the label prediction as such: "Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint"

vikram71198 commented 8 months ago

Check #1391; it's an open issue. Set use_cache=False to get rid of the error for now.

BenjaminBossan commented 8 months ago

Hi, I tried to reproduce the issue by running with a few tiny changes (training only 1 epoch, saving and loading the model locally instead of on HF Hub) and could not reproduce the error. I use the latest version of PEFT (0.8.2), as well as transformers==4.37.2. Could you please check if you see the same outcome?

Here is the complete code:

from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftModel
import torch
from datasets import load_dataset
import os
from import DataLoader
from tqdm import tqdm

device = "cuda"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
peft_config = PromptTuningConfig(
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",

dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
    "/", "_"
text_column = "Tweet text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 1  # CHANGED
batch_size = 8

dataset = load_dataset("ought/raft", dataset_name)
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
dataset =
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])

def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length causal-LM features.

    Each prompt is built as ``"<text_column> : <text> Label : "`` and its
    target label is appended, followed by one pad token. Every sequence is
    then left-padded to ``max_length`` and cut to ``max_length`` items before
    being converted to a tensor.

    Relies on the module-level names ``tokenizer``, ``text_column``,
    ``label_column`` and ``max_length``.

    Args:
        examples: Batch mapping indexed by ``text_column`` and
            ``label_column``; both entries are iterated in parallel, one item
            per sample.

    Returns:
        The tokenizer output for the prompts, updated in place so that
        ``"input_ids"`` and ``"attention_mask"`` hold one tensor per sample,
        with an added ``"labels"`` entry holding the matching label tensors.
    """
    num_examples = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)

    # Pass 1: append the target (plus one pad token) to each prompt.
    for i in range(num_examples):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        # Prompt positions are labelled -100, the default ``ignore_index`` of
        # PyTorch's cross-entropy loss, so only the target tokens are scored.
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    # Pass 2: left-pad every sequence to max_length, then cut and tensorize.
    for i in range(num_examples):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        # A negative pad length (sequence longer than max_length) yields an
        # empty prefix; the slices below then drop the overflow on the right.
        pad_length = max_length - len(sample_input_ids)
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * pad_length + sample_input_ids
        # The pasted original was cut off after ``model_inputs[`` on this
        # statement; the sample's existing mask is the operand that mirrors
        # the input_ids and labels padding around it.
        model_inputs["attention_mask"][i] = [0] * pad_length + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [-100] * pad_length + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

processed_datasets =
    desc="Running tokenizer on dataset",

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    num_training_steps=(len(train_dataloader) * num_epochs),

model =

for epoch in range(num_epochs):
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()

    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
loaded = PeftModel.from_pretrained(model, "/tmp/issue-1390").to(device)

inputs = tokenizer(
    f'{text_column} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : ',
with torch.no_grad():
    inputs = {k: for k, v in inputs.items()}
    outputs = loaded.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

# prints: ['Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint<?php\n/**\n *']
github-actions[bot] commented 7 months ago

HeegonJin commented 7 months ago

This issue is still valid