I use transformers 4.12.5 and have the same problem. The converted model builds on the RoBERTa classes (`RobertaModel`), and the self-attention inside each `RobertaLayer` is replaced by `LongformerSelfAttention`. In the new version, the `forward` parameters of the two attention classes are defined differently, which causes this error. Adding a `past_key_value` parameter to the `forward` of `LongformerSelfAttention` solves the problem. Please refer to https://github.com/huggingface/transformers/blob/4c0dd199c8305903564c2edeae23d294edd4b321/src/transformers/models/roberta/modeling_roberta.py#L532 for more information.
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    output_attentions=False,
):
    '''
    The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to
        -ve: no attention
          0: local attention
        +ve: global attention
    '''
    assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None"
    assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and should be None"
    assert past_key_value is None, "`past_key_value` is not used and should be None"
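To see where the eighth argument in the error message comes from (compare the traceback at the end of this issue), here is a minimal illustration, not the actual transformers code: `RobertaAttention.forward` passes `past_key_value` through to `self.self(...)` positionally, so a custom `forward` that does not declare that parameter rejects the call.

    # Illustrative only: a forward() without `past_key_value` accepts at most
    # 7 positional arguments including self.
    class OldSelfAttention:
        def forward(self, hidden_states, attention_mask=None, head_mask=None,
                    encoder_hidden_states=None, encoder_attention_mask=None,
                    output_attentions=False):
            return hidden_states

    # RobertaAttention.forward calls the layer's self-attention with seven
    # positional arguments, the sixth being past_key_value, i.e. eight
    # arguments including self, which the old signature cannot accept.
    try:
        OldSelfAttention().forward(
            'hidden_states', 'attention_mask', 'head_mask',
            None,   # encoder_hidden_states
            None,   # encoder_attention_mask
            None,   # past_key_value -- the argument the old signature lacks
            False,  # output_attentions
        )
    except TypeError as err:
        print(err)  # forward() takes from 2 to 7 positional arguments but 8 were given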
I have tested it on 4.12.5 with the following solution and it works.
import copy
import logging

import torch
from transformers import RobertaForMaskedLM, RobertaTokenizerFast
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention

logger = logging.getLogger(__name__)


class RobertaLongSelfAttention(LongformerSelfAttention):
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # collapse the extended attention mask back to shape (batch, seq_len)
        attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = any(is_index_global_attn.flatten())
        return super().forward(
            hidden_states,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
class RobertaLongForMaskedLM(RobertaForMaskedLM):
    def __init__(self, config):
        super().__init__(config)
        for i, layer in enumerate(self.roberta.encoder.layer):
            layer.attention.self = RobertaLongSelfAttention(config, layer_id=i)
def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained(MODEL_PATH)
    tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0, 1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
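For completeness, a rough usage sketch of the conversion, assuming the code above is in scope and `MODEL_PATH` points at a pretrained RoBERTa checkpoint (the output path, attention window, and sequence length below are placeholders):

    # placeholders: adjust the output directory, window size, and max length as needed
    save_dir = 'roberta-base-4096'
    model, tokenizer = create_long_model(
        save_model_to=save_dir, attention_window=512, max_pos=4096
    )

    # reload through the subclass so every layer uses RobertaLongSelfAttention
    model = RobertaLongForMaskedLM.from_pretrained(save_dir)
    tokenizer = RobertaTokenizerFast.from_pretrained(save_dir)

    inputs = tokenizer(
        'a very long document ...', return_tensors='pt',
        padding='max_length', truncation=True, max_length=4096,
    )
    output = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])[0]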
When I ran the model, I got the error below.
Traceback (most recent call last):
  File "/Users/scs/Desktop/simbert-master/test_longformer.py", line 41, in <module>
    output = model(input_ids, attention_mask=attention_mask)[0]
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 815, in forward
    encoder_outputs = self.encoder(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 508, in forward
    layer_outputs = layer_module(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 395, in forward
    self_attention_outputs = self.attention(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 323, in forward
    self_outputs = self.self(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
TypeError: forward() takes from 2 to 7 positional arguments but 8 were given