Information
The problem arises in chapter:
Describe the bug
When I try to execute cell 26 in 08_model-compression.ipynb on my local machine, I get the following error:
To Reproduce
Steps to reproduce the behavior:
RuntimeError                              Traceback (most recent call last)
Input In [28], in <cell line: 6>()
      1 distilbert_trainer = DistillationTrainer(model_init=student_init,
      2     teacher_model=teacher_model, args=student_training_args,
      3     train_dataset=clinc_enc['train'], eval_dataset=clinc_enc['validation'],
      4     compute_metrics=compute_metrics, tokenizer=student_tokenizer)
----> 6 distilbert_trainer.train()

File ~\anaconda3\lib\site-packages\transformers\trainer.py:1316, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1314     tr_loss_step = self.training_step(model, inputs)
   1315 else:
-> 1316     tr_loss_step = self.training_step(model, inputs)
   1318 if (
   1319     args.logging_nan_inf_filter
   1320     and not is_torch_tpu_available()
   1321     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1322 ):
   1323     # if loss is nan or inf simply add the average of previous logged losses
   1324     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~\anaconda3\lib\site-packages\transformers\trainer.py:1849, in Trainer.training_step(self, model, inputs)
   1847     loss = self.compute_loss(model, inputs)
   1848 else:
-> 1849     loss = self.compute_loss(model, inputs)
   1851 if self.args.n_gpu > 1:
   1852     loss = loss.mean()  # mean() to average on multi-gpu parallel training

Input In [17], in DistillationTrainer.compute_loss(self, model, inputs, return_outputs)
     10 def compute_loss(self, model, inputs, return_outputs=False):
---> 11     outputs_stu = model(**inputs)
     12     # Extract cross-entropy loss and logits from student
     13     loss_ce = outputs_stu.loss

File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File ~\anaconda3\lib\site-packages\transformers\models\distilbert\modeling_distilbert.py:729, in DistilBertForSequenceClassification.forward(self, input_ids, attention_mask, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
    721 r"""
    722 labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
    723     Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
    724     config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
    725     If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    726 """
    727 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 729 distilbert_output = self.distilbert(
    730     input_ids=input_ids,
    731     attention_mask=attention_mask,
    732     head_mask=head_mask,
    733     inputs_embeds=inputs_embeds,
    734     output_attentions=output_attentions,
    735     output_hidden_states=output_hidden_states,
    736     return_dict=return_dict,
    737 )
    738 hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
    739 pooled_output = hidden_state[:, 0]  # (bs, dim)

File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File ~\anaconda3\lib\site-packages\transformers\models\distilbert\modeling_distilbert.py:550, in DistilBertModel.forward(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)
    547 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
    549 if inputs_embeds is None:
--> 550     inputs_embeds = self.embeddings(input_ids)  # (bs, seq_length, dim)
    551 return self.transformer(
    552     x=inputs_embeds,
    553     attn_mask=attention_mask,
    (...)
    557     return_dict=return_dict,
    558 )

File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File ~\anaconda3\lib\site-packages\transformers\models\distilbert\modeling_distilbert.py:130, in Embeddings.forward(self, input_ids)
    127 position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
    128 position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
--> 130 word_embeddings = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
    131 position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
    133 embeddings = word_embeddings + position_embeddings  # (bs, max_seq_length, dim)

File ~\anaconda3\lib\site-packages\torch\nn\modules\module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

File ~\anaconda3\lib\site-packages\torch\nn\modules\sparse.py:158, in Embedding.forward(self, input)
    157 def forward(self, input: Tensor) -> Tensor:
--> 158     return F.embedding(
    159         input, self.weight, self.padding_idx, self.max_norm,
    160         self.norm_type, self.scale_grad_by_freq, self.sparse)

File ~\anaconda3\lib\site-packages\torch\nn\functional.py:2183, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2177     # Note [embedding_renorm set_grad_enabled]
   2178     # XXX: equivalent to
   2179     # with torch.no_grad():
   2180     #   torch.embedding_renorm_
   2181     # remove once script supports set_grad_enabled
   2182     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2183 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)
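The last frame shows where the mismatch happens: the student's embedding weights are on cuda:0 while the input_ids being looked up are still on the CPU. One way to rule out the input side is to move every batch tensor onto the model's own device inside the custom compute_loss. This is only a diagnostic sketch, not the notebook's code: the distillation terms are elided, and it assumes the DistillationTrainer subclasses transformers.Trainer as in the book.

import torch
from transformers import Trainer

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model

    def compute_loss(self, model, inputs, return_outputs=False):
        # Align every tensor in the batch with the device the student actually
        # lives on, so the embedding lookup no longer mixes cuda:0 weights
        # with CPU token ids.
        device = next(model.parameters()).device
        inputs = {k: v.to(device) if torch.is_tensor(v) else v
                  for k, v in inputs.items()}
        outputs_stu = model(**inputs)
        loss_ce = outputs_stu.loss
        # ... teacher forward pass and distillation loss as in the notebook ...
        return (loss_ce, outputs_stu) if return_outputs else loss_ce

The teacher, if it is used later in compute_loss, needs to sit on the same device as well (e.g. via teacher_model.to(device)).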
Expected behavior
Even when I set the device to "CPU" at the start of the notebook, I still get this error.
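Regarding the CPU attempt: the Trainer does not look at a device variable defined elsewhere in the notebook; it picks its device from the TrainingArguments. A minimal sketch of forcing a CPU-only run, reusing the student_training_args name from the cell above (the other values are placeholders, not the notebook's settings):

from transformers import TrainingArguments

student_training_args = TrainingArguments(
    output_dir="distilbert-clinc-student",  # placeholder output directory
    no_cuda=True,                           # keep the model and all batches on the CPU
    num_train_epochs=1,                     # illustrative values only
    per_device_train_batch_size=48,
)

With no_cuda=True the student and the batches stay on the CPU, so a cuda:0 / cpu mix cannot occur; conversely, for a GPU run the teacher should be moved to the GPU explicitly.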