Training Error with an Attribute error

harishsg999 commented 2 weeks ago

AttributeError: 'DataParallel' object has no attribute 'device'

Traceback: Skipping iteration due to error: Caught RuntimeError in replica 0 on device 0. Original Traceback (most recent call last): File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker output = module(*input, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/model.py", line 72, in forward output = self.model(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/base.py", line 149, in forward span_rep = self.span_rep_layer(words_embedding, span_idx) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/span_rep.py", line 356, in forward return self.span_rep_layer(x, args) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return 
self._call_impl(args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/span_rep.py", line 286, in forward return self.out_project(cat).view(B, L, self.max_width, D) RuntimeError: shape '[8, 11, 12, 512]' is invalid for input of size 589824

RuntimeError Traceback (most recent call last) File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/training/trainer.py:57, in Trainer.training_step(self, model, inputs) 56 with self.compute_loss_context_manager(): ---> 57 loss = self.compute_loss(model, inputs) 59 del inputs

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/training/trainer.py:92, in Trainer.compute_loss(self, model, inputs) 91 # Forward pass ---> 92 outputs = model(alpha = self.args.focal_loss_alpha, 93 gamma = self.args.focal_loss_gamma, 94 label_smoothing = self.args.label_smoothing, 95 reduction = self.args.loss_reduction, 96 **inputs) 97 loss = outputs.loss

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs) 1517 else: -> 1518 return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs) 1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1525 or _global_backward_pre_hooks or _global_backward_hooks 1526 or _global_forward_hooks or _global_forward_pre_hooks): -> 1527 return forward_call(*args, **kwargs) 1529 try:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs) 184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) --> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs) 186 return self.gather(outputs, self.output_device)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs) 199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]: --> 200 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:110, in parallel_apply(modules, inputs, kwargs_tup, devices) 109 if isinstance(output, ExceptionWrapper): --> 110 output.reraise() 111 outputs.append(output)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/_utils.py:694, in ExceptionWrapper.reraise(self) 693 raise RuntimeError(msg) from None --> 694 raise exception

RuntimeError: Caught RuntimeError in replica 0 on device 0. Original Traceback (most recent call last): File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker output = module(*input, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/model.py", line 72, in forward output = self.model(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/base.py", line 149, in forward span_rep = self.span_rep_layer(words_embedding, span_idx) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/span_rep.py", line 356, in forward return self.span_rep_layer(x, args) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(args, kwargs) File 
"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, kwargs) File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/modeling/span_rep.py", line 286, in forward return self.out_project(cat).view(B, L, self.max_width, D) RuntimeError: shape '[8, 11, 12, 512]' is invalid for input of size 589824

During handling of the above exception, another exception occurred:

AttributeError Traceback (most recent call last) Cell In[17], line 28 1 training_args = TrainingArguments( 2 output_dir="models", 3 learning_rate=5e-6, (...) 17 report_to="none", 18 ) 20 trainer = Trainer( 21 model=model, 22 args=training_args, (...) 26 data_collator=data_collator, 27 ) ---> 28 trainer.train()

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs) 1883 hf_hub_utils.enable_progress_bars() 1884 else: -> 1885 return inner_training_loop( 1886 args=args, 1887 resume_from_checkpoint=resume_from_checkpoint, 1888 trial=trial, 1889 ignore_keys_for_eval=ignore_keys_for_eval, 1890 )

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval) 2213 self.control = self.callback_handler.on_step_begin(args, self.state, self.control) 2215 with self.accelerator.accumulate(model): -> 2216 tr_loss_step = self.training_step(model, inputs) 2218 if ( 2219 args.logging_nan_inf_filter 2220 and not is_torch_xla_available() 2221 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) 2222 ): 2223 # if loss is nan or inf simply add the average of previous logged losses 2224 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/gliner/training/trainer.py:82, in Trainer.training_step(self, model, inputs) 80 model.zero_grad(set_to_none=True) 81 torch.cuda.empty_cache() ---> 82 return torch.tensor(0.0, requires_grad=True).to(model.device)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/nn/modules/module.py:1695, in Module.__getattr__(self, name) 1693 if name in modules: 1694 return modules[name] -> 1695 raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

AttributeError: 'DataParallel' object has no attribute 'device'

Ingvarstep commented 1 week ago

@harishsg999, sorry for the delayed reply. Can you provide more information that would help me reproduce the issue?

quannguyen268 commented 1 week ago

@Ingvarstep I have the same issue. Information to reproduce: gliner==0.2.7, transformers==4.42.3, GPU: 2xA30.

Code: https://github.com/urchade/GLiNER/blob/main/examples/finetune.ipynb

urchade / GLiNER

Training Error with an Attribute error #133