allenai / longformer

Longformer: The Long-Document Transformer
https://arxiv.org/abs/2004.05150
Apache License 2.0

TypeError: forward() takes from 2 to 7 positional arguments but 8 were given #215

Open · SCS2017 opened 2 years ago

SCS2017 commented 2 years ago

When I run the model, I get the error below.

Traceback (most recent call last):
  File "/Users/scs/Desktop/simbert-master/test_longformer.py", line 41, in <module>
    output = model(input_ids, attention_mask=attention_mask)[0]
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 815, in forward
    encoder_outputs = self.encoder(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 508, in forward
    layer_outputs = layer_module(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 395, in forward
    self_attention_outputs = self.attention(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/transformers/models/roberta/modeling_roberta.py", line 323, in forward
    self_outputs = self.self(
  File "/Users/scs/opt/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
TypeError: forward() takes from 2 to 7 positional arguments but 8 were given
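
For reference, the failing line 41 is just a plain forward pass over the converted model. A minimal sketch of such a test script is below; the checkpoint path is a placeholder, and RobertaLongForMaskedLM is the custom class from the conversion notebook (quoted later in this thread), not necessarily what test_longformer.py actually uses:

from transformers import RobertaTokenizerFast

# Placeholder path to a RoBERTa checkpoint converted with the notebook code
# quoted below; RobertaLongForMaskedLM is defined further down in this thread.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base-long-4096")
model = RobertaLongForMaskedLM.from_pretrained("roberta-base-long-4096")

text = "some long document " * 500
# pad to a multiple of the attention window, as LongformerSelfAttention expects
inputs = tokenizer(text, return_tensors="pt", padding="max_length",
                   max_length=4096, truncation=True)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# On transformers 4.x this call raises:
# TypeError: forward() takes from 2 to 7 positional arguments but 8 were given
output = model(input_ids, attention_mask=attention_mask)[0]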

Dreamupers commented 2 years ago

I use transformers 4.12.5 and have the same problem. The long-model conversion subclasses RobertaModel and replaces the self-attention inside each RobertaLayer with LongformerSelfAttention. In newer transformers versions the forward signatures of the two attention classes no longer match (the layer now also passes past_key_value), which is what triggers this error. Adding a past_key_value parameter to the forward of the LongformerSelfAttention override solves the problem. Please refer to https://github.com/huggingface/transformers/blob/4c0dd199c8305903564c2edeae23d294edd4b321/src/transformers/models/roberta/modeling_roberta.py#L532 for more information.

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        '''
        The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to
            -ve: no attention
              0: local attention
            +ve: global attention
        '''
        assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None"
        assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and should be None"
        assert past_key_value is None, "`past_key_value` is not used and should be None"
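
The "8 were given" comes from RobertaAttention.forward in recent transformers versions, which hands all of its arguments to the self-attention module positionally, including the new past_key_value. A small stand-alone illustration of the mismatch, using a toy class rather than the real transformers code:

class OldStyleSelfAttention:
    # Same parameter list as the pre-fix LongformerSelfAttention override:
    # no past_key_value.
    def forward(self, hidden_states, attention_mask=None, head_mask=None,
                encoder_hidden_states=None, encoder_attention_mask=None,
                output_attentions=False):
        return hidden_states

attn = OldStyleSelfAttention()
# RobertaAttention.forward calls self.self(...) with seven positional
# arguments (hidden_states, attention_mask, head_mask, encoder_hidden_states,
# encoder_attention_mask, past_key_value, output_attentions):
attn.forward("hidden_states", None, None, None, None, None, False)
# TypeError: forward() takes from 2 to 7 positional arguments but 8 were given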

yusufcakmakk commented 2 years ago

I have tested it on 4.12.5 with the following solution and it works.

class RobertaLongSelfAttention(LongformerSelfAttention):
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = any(is_index_global_attn.flatten())
        return super().forward(hidden_states, 
                               is_index_masked=is_index_masked, 
                               is_index_global_attn=is_index_global_attn, 
                               is_global_attn=is_global_attn,
                               attention_mask=attention_mask, 
                               output_attentions=output_attentions)

class RobertaLongForMaskedLM(RobertaForMaskedLM):
    def __init__(self, config):
        super().__init__(config)
        for i, layer in enumerate(self.roberta.encoder.layer):
            layer.attention.self = RobertaLongSelfAttention(config, layer_id=i)

def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained(MODEL_PATH)
    tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)
    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer
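
The snippet above assumes the usual imports and globals from the conversion notebook; a minimal usage sketch with placeholder values (MODEL_PATH, the output directory, and the hyper-parameters are all assumptions):

import copy
import logging

import torch
from transformers import RobertaForMaskedLM, RobertaTokenizerFast
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention

logger = logging.getLogger(__name__)
MODEL_PATH = "roberta-base"  # placeholder: any RoBERTa checkpoint

# Convert and save, then reload through the patched class so that the
# past_key_value-aware RobertaLongSelfAttention is used at inference time.
create_long_model(save_model_to="roberta-base-long-4096",
                  attention_window=512, max_pos=4096)

model = RobertaLongForMaskedLM.from_pretrained("roberta-base-long-4096")
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base-long-4096")

text = "a long document " * 500
# pad to a multiple of the attention window (4096 = 8 * 512)
inputs = tokenizer(text, return_tensors="pt", padding="max_length",
                   max_length=4096, truncation=True)
output = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])[0]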

dysby commented 1 year ago

According to PR 5811, the line

is_global_attn = any(is_index_global_attn.flatten())

should be

is_global_attn = is_index_global_attn.flatten().any().item()
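
Putting yusufcakmakk's override and this correction together, the patched self-attention would look roughly like this (a sketch against transformers 4.12.x, not an official fix):

from transformers.models.longformer.modeling_longformer import LongformerSelfAttention

class RobertaLongSelfAttention(LongformerSelfAttention):
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # The extended mask arrives as (batch, 1, 1, seq_len); collapse it
        # back to (batch, seq_len) before handing it to Longformer attention.
        attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()
        return super().forward(
            hidden_states,
            attention_mask=attention_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )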