clinicalml / TabLLM

MIT License
265 stars 42 forks source link

Element 0 of tensor does not require grad and does not have a grad_fn #12

Closed YasHGoyaL27 closed 10 months ago

YasHGoyaL27 commented 11 months ago

I am getting the following error while trying to replicate your code

Version used: torch == 2.0.1 pytorch_lightning == 1.9.2 deepspeed == 0.10.3

11,402.764 Total estimated model params size (MB)

Epoch 0: 0%| | 0/8 [00:00<?, ?it/s]Traceback (most recent call last):

File "/code/llm/t_few/src/pl_train.py", line 139, in <module>

main(config)

File "code/llm/t_few/src/pl_train.py", line 99, in main

trainer.fit(model, datamodule)

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit

self._call_and_handle_interrupt(

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt

return trainer_fn(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl

results = self._run(model, ckpt_path=self.ckpt_path)

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run

results = self._run_stage()

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage

return self._run_train()

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train

self.fit_loop.run()

File "python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run

self.advance(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 269, in advance

self._outputs = self.epoch_loop.run(self._data_fetcher)

File "python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run

self.advance(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance

batch_output = self.batch_loop.run(batch, batch_idx)

File "python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run

self.advance(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance

outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)

File "python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run

self.advance(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 203, in advance

result = self._run_optimization(

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization

self._optimizer_step(optimizer, opt_idx, batch_idx, closure)

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 369, in _optimizer_step

self.trainer._call_lightning_module_hook(

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook

output = fn(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1625, in optimizer_step

optimizer.step(closure=optimizer_closure)

File "python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step

step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)

File "python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step

return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)

File "python3.9/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 80, in optimizer_step

return super().optimizer_step(model, optimizer, optimizer_idx, closure, **kwargs)

File "python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 155, in optimizer_step

return optimizer.step(closure=closure, **kwargs)

File "python3.9/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper

return wrapped(*args, **kwargs)

File "python3.9/site-packages/torch/optim/optimizer.py", line 88, in wrapper

return func(*args, **kwargs)

File "python3.9/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context

return func(*args, **kwargs)

File "python3.9/site-packages/transformers/optimization.py", line 649, in step

loss = closure()

File "python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 140, in _wrap_closure

closure_result = closure()

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__

self._result = self.closure(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 143, in closure

self._backward_fn(step_output.closure_loss)

File "python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 311, in backward_fn

self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)

File "python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1763, in _call_strategy_hook

output = fn(*args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 168, in backward

self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 80, in backward

model.backward(closure_loss, optimizer, *args, **kwargs)

File "python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1370, in backward

loss.backward(*args, **kwargs)

File "python3.9/site-packages/torch/_tensor.py", line 307, in backward

torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)

File "python3.9/site-packages/torch/autograd/__init__.py", line 154, in backward

Variable._execution_engine.run_backward(

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Epoch 0: 0%| | 0/8 [03:19<?, ?it/s]

stefanhgm commented 11 months ago

Hello @YasHGoyaL27,

thanks for reaching out and using our code!

This seems like a problem with the backward pass during training, which is a rather general error.

One thing I spotted is that your PyTorch version does not seem to match the one we were using (2.0.1 versus 1.10.1). Could you please try to follow exactly the instructions in the readme to get the necessary environment?

Please let us know if you need any further help!

Did you follow all steps in the readme and which model are you running?

YasHGoyaL27 commented 11 months ago

Thank you for your response. I was able to run it by correcting the versions.

stefanhgm commented 10 months ago

Great! Thanks for the feedback.