utterworks / fast-bert

Super easy library for BERT based NLP models
Apache License 2.0
1.87k stars 341 forks

Issue with torch in BertLMLearner #179

Open bhavsarpratik opened 4 years ago

bhavsarpratik commented 4 years ago

I am getting this issue with BertLMLearner but not with BertLearner. I did a lot of debugging and also changed CUDA versions, but couldn't make it work. I do not get this error when I use run_language_modeling.py from transformers.
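For reference, here is roughly the setup that triggers the trace below. This is a sketch following the language-model workflow from the fast-bert README; the data path, text list, and hyperparameters are placeholders rather than my exact values.

# Sketch of the failing setup, assuming the fast-bert README's LM workflow.
# Paths, the text list, and hyperparameters are illustrative placeholders.
import logging
import torch
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner

logger = logging.getLogger()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
texts = ["first raw document ...", "second raw document ..."]  # placeholder corpus

databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir="./data/",
    text_list=texts,
    tokenizer="roberta-base",
    batch_size_per_gpu=8,
    max_seq_length=256,
    multi_gpu=False,
    model_type="roberta",
    logger=logger,
)

learner = BertLMLearner.from_pretrained_model(
    dataBunch=databunch_lm,
    pretrained_path="roberta-base",
    output_dir="./output/",
    metrics=[],
    device=device,
    logger=logger,
    multi_gpu=False,
    logging_steps=50,
)

# The fit() call below is where the CUDA error surfaces during the forward pass.
learner.fit(
    epochs=1,
    lr=1e-4,
    validate=True,                  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb",
)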

RuntimeError Traceback (most recent call last)

in ()
      3     validate=True,  # Evaluate the model after each epoch
      4     schedule_type="warmup_cosine",
----> 5     optimizer_type="lamb",
      6 )

15 frames

/usr/local/lib/python3.6/dist-packages/fast_bert/learner_lm.py in fit(self, epochs, lr, validate, schedule_type, optimizer_type)
    141                 self.model.train()
    142
--> 143                 outputs = self.model(inputs, masked_lm_labels=labels)
    144                 loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
    145

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, masked_lm_labels)
    230             position_ids=position_ids,
    231             head_mask=head_mask,
--> 232             inputs_embeds=inputs_embeds,
    233         )
    234         sequence_output = outputs[0]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask)
    788             head_mask=head_mask,
    789             encoder_hidden_states=encoder_hidden_states,
--> 790             encoder_attention_mask=encoder_extended_attention_mask,
    791         )
    792         sequence_output = encoder_outputs[0]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
    405
    406             layer_outputs = layer_module(
--> 407                 hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
    408             )
    409             hidden_states = layer_outputs[0]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
    366         encoder_attention_mask=None,
    367     ):
--> 368         self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
    369         attention_output = self_attention_outputs[0]
    370         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
    312     ):
    313         self_outputs = self.self(
--> 314             hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
    315         )
    316         attention_output = self.output(self_outputs[0], hidden_states)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
    214         encoder_attention_mask=None,
    215     ):
--> 216         mixed_query_layer = self.query(hidden_states)
    217
    218         # If this is instantiated as a cross-attention module, the keys

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/linear.py in forward(self, input)
     85
     86     def forward(self, input):
---> 87         return F.linear(input, self.weight, self.bias)
     88
     89     def extra_repr(self):

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1370         ret = torch.addmm(bias, input, weight.t())
   1371     else:
->  1372         output = input.matmul(weight.t())
   1373     if bias is not None:
   1374         output += bias

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
kinoc commented 4 years ago

Experienced the same error. Switching to bert produces an IndexError instead:

File "Initial_test_k_lm.py", line 198, in optimizer_type="adamw") #adamw /lamb File "/home/kino/.local/lib/python3.6/site-packages/fast_bert/learner_lm.py", line 143, in fit outputs = self.model(inputs, masked_lm_labels=labels) File "/home/kino/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in call result = self.forward(*input, *kwargs) File "/home/kino/.local/lib/python3.6/site-packages/transformers/modeling_bert.py", line 1003, in forward masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) File "/home/kino/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in call result = self.forward(input, **kwargs) File "/home/kino/.local/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 916, in forward ignore_index=self.ignore_index, reduction=self.reduction) File "/home/kino/.local/lib/python3.6/site-packages/torch/nn/functional.py", line 2021, in cross_entropy return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction) File "/home/kino/.local/lib/python3.6/site-packages/torch/nn/functional.py", line 1838, in nll_loss ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index) IndexError: Target -1 is out of bounds.

kinoc commented 4 years ago

See: "Version 2.4.1 breaks run_lm_finetuning.py, version 2.3.0 runs fine"

For us in fast-bert, changing this line in the mask_tokens function in data_lm.py:

labels[~masked_indices] = -1

to:

labels[~masked_indices] = -100

restores training locally on both CUDA and CPU. See the transformers release note "Breaking changes: Ignored indices in PyTorch loss computing" (@LysandreJik).
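For context, here is a sketch of what mask_tokens looks like with that fix applied. It follows the Hugging Face LM fine-tuning example that data_lm.py is derived from; fast-bert's actual code may differ in details beyond the single -100 line.

# Sketch of mask_tokens with the proposed -100 fix applied.
import torch

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked-LM inputs/labels: 80% [MASK], 10% random token, 10% unchanged."""
    labels = inputs.clone()

    # Sample which positions get masked, never masking special tokens.
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()

    # The fix: use -100 (PyTorch's default ignore_index) instead of -1,
    # so the loss is only computed on masked positions.
    labels[~masked_indices] = -100

    # 80% of masked positions -> [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of masked positions -> random token
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The remaining 10% keep their original token.
    return inputs, labels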

Please fix, please...

neel04 commented 3 years ago

Same issue. Can anyone help?