maxrousseau closed this issue 1 year ago
cc @ArthurZucker
Hello @patrickvonplaten @ArthurZucker,
I wrote a simple test case, using a few examples from SQuAD, to reproduce the error I am getting with the model I am trying to implement.
from datasets import Dataset


def formatToMI(dataset):
    """take a squad-like qa dataset and transform into MLM format"""
    masked_strings = []
    full_strings = []
    qa_strings = []
    answer_strings = []
    for i in range(len(dataset["question"])):
        question = dataset["question"][i]
        answer = dataset["answers"][i]["text"][0]
        context = dataset["context"][i]
        masked_strings.append(
            "Question: {} Answer: <mask>. Context: {}".format(question, context)
        )
        full_strings.append(
            "Question: {} Answer: {}. Context: {}".format(question, answer, context)
        )
        qa_strings.append("Question: {} Answer: {}.".format(question, answer))
        answer_strings.append(answer)
    return {
        "masked_strings": masked_strings,
        "full_strings": full_strings,
        "qa_strings": qa_strings,
        "answer_strings": answer_strings,
        "id": dataset["id"],
    }
def loadSquadMI(n=None):
    """create a dataloader for SQuAD"""
    from datasets import load_dataset

    raw_datasets = load_dataset("squad")
    if n is not None:
        squad_subset = formatToMI(raw_datasets["train"][:n])
        return squad_subset
    else:
        return 0


samples = loadSquadMI(n=100)
tiny_squad = Dataset.from_dict(samples)
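# Not part of the original report: a quick (hypothetical) sanity check showing the
# columns produced by formatToMI -- each example carries four string views plus the SQuAD id.
print(tiny_squad.column_names)
# expected: ['masked_strings', 'full_strings', 'qa_strings', 'answer_strings', 'id']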
from transformers import AutoTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq
import torch
from torch.utils.data import DataLoader

# initialize BART and PrefixBART for MI
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
examples = tiny_squad

# PrefixBartForConditionalGeneration is the custom class defined in an earlier
# notebook cell (see <ipython-input-5> in the traceback below).
prefixbart_model = PrefixBartForConditionalGeneration.from_pretrained("facebook/bart-base")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=prefixbart_model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)
# preprocessing
def training_preprocessing(examples):
    """examples have all three types of string"""
    padding = "max_length"
    model_inputs = tokenizer(
        examples["masked_strings"],
        max_length=384,
        padding=padding,
        truncation=False,
    )
    labels = tokenizer(
        text_target=examples["qa_strings"],
        max_length=128,
        padding=padding,
        truncation=True,
    )
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
proc_train_dataset = examples.map(
    training_preprocessing,
    batched=True,
    remove_columns=examples.column_names,
)

train_tensor = proc_train_dataset
train_tensor.set_format("torch")

train_dataloader = DataLoader(
    train_tensor,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
    num_workers=0,
)
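# Not in the original report: a hypothetical sanity check of what the collator hands
# the model. Inspecting the tensor shapes here makes the later size mismatch easier to spot.
debug_batch = next(iter(train_dataloader))
print({k: tuple(v.shape) for k, v in debug_batch.items()})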
bart_model.train()
batch = next(iter(train_dataloader))
outputs = bart_model(**batch)
loss = outputs.loss
print(loss)
Output:
tensor(0.8271, grad_fn=<NllLossBackward0>)
prefixbart_model.train()
batch = next(iter(train_dataloader))
outputs = prefixbart_model(**batch)
loss = outputs.loss
print(loss)
Output:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
[<ipython-input-26-ebc93e8e099a>](https://localhost:8080/#) in <module>
3 prefixbart_model.train()
4 batch = next(iter(train_dataloader))
----> 5 outputs = prefixbart_model(**batch)
6 loss = outputs.loss
7 print(loss)
9 frames
[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
[<ipython-input-5-71e56dfc61a6>](https://localhost:8080/#) in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
211 )
212
--> 213 outputs = self.model(
214 input_ids,
215 attention_mask=attention_mask,
[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
[/usr/local/lib/python3.8/dist-packages/transformers/models/bart/modeling_bart.py](https://localhost:8080/#) in forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1231
1232 if encoder_outputs is None:
-> 1233 encoder_outputs = self.encoder(
1234 input_ids=input_ids,
1235 attention_mask=attention_mask,
[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
[/usr/local/lib/python3.8/dist-packages/transformers/models/bart/modeling_bart.py](https://localhost:8080/#) in forward(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)
848 )
849 else:
--> 850 layer_outputs = encoder_layer(
851 hidden_states,
852 attention_mask,
[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
[/usr/local/lib/python3.8/dist-packages/transformers/models/bart/modeling_bart.py](https://localhost:8080/#) in forward(self, hidden_states, attention_mask, layer_head_mask, output_attentions)
323 """
324 residual = hidden_states
--> 325 hidden_states, attn_weights, _ = self.self_attn(
326 hidden_states=hidden_states,
327 attention_mask=attention_mask,
[/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py](https://localhost:8080/#) in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
[/usr/local/lib/python3.8/dist-packages/transformers/models/bart/modeling_bart.py](https://localhost:8080/#) in forward(self, hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions)
238 if attention_mask is not None:
239 if attention_mask.size() != (bsz, 1, tgt_len, src_len):
--> 240 raise ValueError(
241 f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
242 )
ValueError: Attention mask should be of size (4, 1, 384, 384), but is torch.Size([4, 1, 388, 388])
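For context on the size mismatch: the 4 extra positions (388 vs. 384) presumably come from the prefix tokens the custom model prepends to the attention mask. The sketch below is a minimal, hypothetical reconstruction (names like pre_seq_len and the prefix length of 4 are assumptions, not the actual PrefixBART code) of how extending the mask on both dimensions trips BART's shape check: the encoder self-attention derives tgt_len and src_len from the 384-token hidden states and never receives the prefix as past_key_value.

import torch

bsz, seq_len, pre_seq_len = 4, 384, 4  # assumed prefix length of 4

# Typical prefix-tuning step: prepend "always attend" positions for the virtual
# prefix tokens onto the padding mask.
attention_mask = torch.ones(bsz, seq_len)
prefix_mask = torch.ones(bsz, pre_seq_len)
extended_mask = torch.cat([prefix_mask, attention_mask], dim=1)  # (4, 388)

# BART expands a (bsz, src_len) padding mask to (bsz, 1, src_len, src_len) when no
# separate tgt_len is given (value inversion/scaling omitted here), so the encoder
# layers receive:
expanded = extended_mask[:, None, None, :].expand(bsz, 1, extended_mask.size(1), extended_mask.size(1))
print(expanded.shape)  # torch.Size([4, 1, 388, 388])

# BartAttention, however, validates the mask against (bsz, 1, tgt_len, src_len)
# computed from the hidden states and key states, which are still 384 tokens long
# in the encoder -- hence the ValueError expecting (4, 1, 384, 384).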
Hello again @patrickvonplaten @ArthurZucker,
I just found out about adapter-transformers, which implements prefix-tuning for BART (the technique P-Tuning v2 builds on). Maybe this issue can be closed?
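For anyone landing here, a minimal prefix-tuning setup with adapter-transformers looks roughly like the sketch below (class names follow the adapter-transformers docs; the adapter name and prefix_length value are arbitrary choices, and the exact API may have changed since):

from transformers import BartForConditionalGeneration
from transformers.adapters import PrefixTuningConfig

# With adapter-transformers installed, the stock model classes gain adapter methods.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Register and activate a prefix-tuning adapter; only the prefix parameters are trained.
prefix_config = PrefixTuningConfig(flat=False, prefix_length=30)
model.add_adapter("squad_prefix", config=prefix_config)
model.train_adapter("squad_prefix")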
Hey! Cool that you found something that works for you! The issue might just have come from a config parameter defining the hidden_size.
Hello, thank you for replying. I will try out the modified config and see if it resolves the issue.
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
@patrickvonplaten Hello, I am trying to implement P-Tuning v2 with BART using Hugging Face's transformers v4.25.1 (P-Tuning v2 official repo). However, when I try to train the model I get the following error:
Any ideas where the issue is coming from or how to resolve it? I am a little unfamiliar with the codebase, so any help would be greatly appreciated.
Thanks,
Here's the code I'm using to run the model: