huggingface / peft

🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
https://huggingface.co/docs/peft
Apache License 2.0

CLM prompt tuning inference based on tutorial throwing runtime error #1390

Closed LaviRoars closed 7 months ago

LaviRoars commented 8 months ago

System Info

accelerate==0.26.1, transformers==4.37.0, peft==0.7.1, torch==2.1.2, sagemaker notebook

Who can help?

Hi, I was running the prompt tuning tutorial from Hugging Face shown in this link https://huggingface.co/docs/peft/task_guides/clm-prompt-tuning#train and when it got to the inference part, model.generate() threw a runtime error about mismatched tensor shapes, as shown in the traceback below. I basically just copied and pasted the code from the tutorial with no modifications; the package versions are listed above.


RuntimeError                              Traceback (most recent call last)
Cell In[23], line 5
      3 with torch.no_grad():
      4     inputs = {k: v.to(device) for k, v in inputs.items()}
----> 5     outputs = model.generate(
      6         input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
      7     )
      8     print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
      9     [
     10         "Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint"
     11     ]

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/peft/peft_model.py:1130, in PeftModelForCausalLM.generate(self, **kwargs)
   1128 self.base_model.generation_config = self.generation_config
   1129 try:
-> 1130     outputs = self.base_model.generate(**kwargs)
   1131 except:
   1132     self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:1474, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1472 if generation_mode == GenerationMode.GREEDY_SEARCH:
   1473     # 11. run greedy search
-> 1474     return self.greedy_search(
   1475         input_ids,
   1476         logits_processor=prepared_logits_processor,
   1477         stopping_criteria=prepared_stopping_criteria,
   1478         pad_token_id=generation_config.pad_token_id,
   1479         eos_token_id=generation_config.eos_token_id,
   1480         output_scores=generation_config.output_scores,
   1481         return_dict_in_generate=generation_config.return_dict_in_generate,
   1482         synced_gpus=synced_gpus,
   1483         streamer=streamer,
   1484         **model_kwargs,
   1485     )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/generation/utils.py:2335, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2332 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2334 # forward pass to get next token
-> 2335 outputs = self(
   2336     **model_inputs,
   2337     return_dict=True,
   2338     output_attentions=output_attentions,
   2339     output_hidden_states=output_hidden_states,
   2340 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1518 return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
-> 1527 return forward_call(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/modeling_bloom.py:858, in BloomForCausalLM.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
--> 858 transformer_outputs = self.transformer(
    859     input_ids,
    860     past_key_values=past_key_values,
    861     attention_mask=attention_mask,
    862     head_mask=head_mask,
    863     inputs_embeds=inputs_embeds,
    864     use_cache=use_cache,
    865     output_attentions=output_attentions,
    866     output_hidden_states=output_hidden_states,
    867     return_dict=return_dict,
    868 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1518 return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
-> 1527 return forward_call(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/modeling_bloom.py:722, in BloomModel.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
--> 722 outputs = block(
    723     hidden_states,
    724     layer_past=layer_past,
    725     attention_mask=causal_mask,
    726     head_mask=head_mask[i],
    727     use_cache=use_cache,
    728     output_attentions=output_attentions,
    729     alibi=alibi,
    730 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1518 return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
-> 1527 return forward_call(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/modeling_bloom.py:410, in BloomBlock.forward(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)
    409 # Self attention.
--> 410 attn_outputs = self.self_attention(
    411     layernorm_output,
    412     residual,
    413     layer_past=layer_past,
    414     attention_mask=attention_mask,
    415     alibi=alibi,
    416     head_mask=head_mask,
    417     use_cache=use_cache,
    418     output_attentions=output_attentions,
    419 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
-> 1518 return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
-> 1527 return forward_call(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/bloom/modeling_bloom.py:288, in BloomAttention.forward(self, hidden_states, residual, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)
    286 # [batch_size * num_heads, q_length, kv_length]
    287 # we use torch.Tensor.baddbmm instead of torch.baddbmm as the latter isn't supported by TorchScript v1.11
--> 288 matmul_result = alibi.baddbmm(
    289     batch1=query_layer,
    290     batch2=key_layer,
    291     beta=self.beta,
    292     alpha=self.inv_norm_factor,
    293 )
    295 # change view to [batch_size, num_heads, q_length, kv_length]
    296 attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length)

RuntimeError: The expanded size of the tensor (38) must match the existing size (94) at non-singleton dimension 2. Target sizes: [16, 1, 38]. Tensor sizes: [16, 1, 94]

Information

Tasks

Reproduction

# `model`, `tokenizer`, `inputs`, and `device` are set up exactly as in the tutorial
model.to(device)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

Expected behavior

It should produce the label prediction, like this: "Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint"

vikram71198 commented 8 months ago

Check #1391. It's an open issue. Set use_cache=False to get rid of the error for now.
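
For anyone who wants to apply that workaround right away, here is a minimal sketch (assuming the model, tokenizer, inputs, and device from the tutorial are already set up) that passes use_cache=False through to generate():

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Disabling the KV cache sidesteps the alibi/key-length mismatch in BLOOM's
    # attention until the underlying issue (#1391) is fixed; generation is
    # slower but should avoid the RuntimeError.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        eos_token_id=3,
        use_cache=False,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))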

BenjaminBossan commented 8 months ago

Hi, I tried to reproduce the issue by running https://huggingface.co/docs/peft/main/en/task_guides/clm-prompt-tuning with a few tiny changes (training only 1 epoch, saving and loading the model locally instead of on HF Hub) and could not reproduce the error. I use the latest version of PEFT (0.8.2), as well as transformers==4.37.2. Could you please check if you see the same outcome?
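
A quick way to double-check which versions are actually active in the notebook kernel before rerunning (a small sketch using the standard-library importlib.metadata; upgrade with pip install -U peft transformers if needed):

import importlib.metadata as md

# Print the versions importable in the current environment; the error was
# reported with peft 0.7.1 / transformers 4.37.0, while the repro below uses
# peft 0.8.2 / transformers 4.37.2.
for pkg in ("peft", "transformers", "accelerate", "torch"):
    print(pkg, md.version(pkg))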

Here is the complete code:

from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftModel
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

device = "cuda"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_name_or_path,
)

dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
text_column = "Tweet text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 1  # CHANGED
batch_size = 8

dataset = load_dataset("ought/raft", dataset_name)
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])

def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        # print(i, sample_input_ids, label_input_ids)
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained("/tmp/issue-1390")
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
loaded = PeftModel.from_pretrained(model, "/tmp/issue-1390").to(device)

inputs = tokenizer(
    f'{text_column} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : ',
    return_tensors="pt",
)
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = loaded.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

# prints: ['Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint<?php\n/**\n *']

github-actions[bot] commented 7 months ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

HeegonJin commented 7 months ago

This issue is still valid