Lightning-AI / pytorch-lightning

Pretrain, finetune and deploy AI models on multiple GPUs, TPUs with zero code changes.
https://lightning.ai
Apache License 2.0

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn #2331

Closed: soulhi-vz closed this issue 4 years ago

soulhi-vz commented 4 years ago

Hi,

I'm getting RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn.

Details below:

import argparse
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.utils.data import DataLoader

import pytorch_lightning as pl


class Autoencoder(pl.LightningModule):

    def __init__(self, hparams: argparse.Namespace):
        super(Autoencoder,self).__init__() 
        self.hparams = hparams

        self.layer_e_1 = nn.Conv1d(hparams.in_channels, hparams.out_channels, hparams.kernel_size)
        self.layer_e_2 = nn.Conv1d(hparams.out_channels,hparams.in_channels,hparams.kernel_size)
        self.layer_d_1 = nn.ConvTranspose1d(hparams.in_channels,hparams.out_channels,hparams.kernel_size)
        self.layer_d_2 = nn.ConvTranspose1d(hparams.out_channels,hparams.in_channels,hparams.kernel_size)

        Train_x_t_e = CarrierDataset()  # note: assigned to a local name here; the dataloaders below read a module-level Train_x_t_e

    def forward(self,x):
        x = self.layer_e_1(x)
        x = F.relu(x)
        x = self.layer_e_2(x)
        encoded = F.relu(x)
        x = self.layer_d_1(encoded)
        x = F.relu(x)
        decoded = self.layer_d_2(x)
        return decoded, encoded

    def training_step(self, train_batch, batch_idx):
        x, _ = train_batch
        decoded, encoded = self.forward(x)
        mse = nn.MSELoss()
        loss = mse(x, decoded)
        return {'loss': loss}

    def validation_step(self,val_batch, batch_idx):
        x, _ = val_batch
        decoded, encoded = self.forward(x)
        mse = nn.MSELoss()
        loss = mse(x, decoded)
        return {'val_loss': loss}

    def train_dataloader(self):
        loader = torch.utils.data.DataLoader(
            dataset=Train_x_t_e,
            batch_size=self.hparams.batch_size,
            shuffle=True,
            pin_memory=True)
        return loader

    def val_dataloader(self):
        return DataLoader(dataset=Train_x_t_e, batch_size=self.hparams.batch_size)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer

class CarrierDataset(data.Dataset):

    def __init__(self):
        Train_x_t_e1 = torch.load('Tensor_100K_1.pt')
        Train_x_t_e2 = torch.load('Tensor_100K_2.pt')
        ag_t = torch.cat((Train_x_t_e1,Train_x_t_e2))
        #df = read_csv(ds)
        self.len = ag_t.shape[0]
        self.ag_t = ag_t
        self.size = sys.getsizeof(ag_t)

    def __getitem__(self,index):
        return self.ag_t[index], self.ag_t[index]

    def __len__(self):
        return self.len

    def __size__(self):
        return self.size    

Got the following error:

RuntimeErrorTraceback (most recent call last)
<ipython-input-277-b675dec29cfe> in <module>
     16 print("Parameters:")
     17 print(args)
---> 18 main(args)

<ipython-input-276-23c135174ba6> in main(hparams)
     19     )
     20 
---> 21     trainer.fit(model)

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders)
    885             self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)
    886 
--> 887             self.run_pretrain_routine(model)
    888 
    889         # return 1 when finished

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_pretrain_routine(self, model)
   1013 
   1014         # CORE TRAINING LOOP
-> 1015         self.train()
   1016 
   1017     def test(

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in train(self)
    345                 # RUN TNG EPOCH
    346                 # -----------------
--> 347                 self.run_training_epoch()
    348 
    349                 # update LR schedulers

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
    417             # RUN TRAIN STEP
    418             # ---------------
--> 419             _outputs = self.run_training_batch(batch, batch_idx)
    420             batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs
    421 

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx)
    595 
    596                 # calculate loss
--> 597                 loss, batch_output = optimizer_closure()
    598 
    599                 # check if loss or model weights are nan

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in optimizer_closure()
    573                     model_ref = self.get_model()
    574                     with self.profiler.profile('model_backward'):
--> 575                         model_ref.backward(self, closure_loss, optimizer, opt_idx)
    576 
    577                     # track metrics for callbacks

/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/hooks.py in backward(self, trainer, loss, optimizer, optimizer_idx)
    153                     scaled_loss.backward()
    154         else:
--> 155             loss.backward()

/opt/conda/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     98     Variable._execution_engine.run_backward(
     99         tensors, grad_tensors, retain_graph, create_graph,
--> 100         allow_unreachable=True)  # allow_unreachable flag
    101 
    102 

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
rohitgr7 commented 4 years ago

Try: loss = mse(decoded, x).
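
For context (not stated in the thread): nn.MSELoss follows the criterion(input, target) convention, where the first argument is the prediction that carries the autograd graph. A self-contained sketch with dummy shapes standing in for the batches above:

import torch
import torch.nn as nn

# nn.MSELoss is called as criterion(input, target);
# `input` is the model output that carries the autograd graph.
criterion = nn.MSELoss()
x = torch.randn(4, 17, 10)                              # dummy target batch
decoded = torch.randn(4, 17, 10, requires_grad=True)    # stand-in for the model output
loss = criterion(decoded, x)                            # prediction first, target second
loss.backward()                                         # succeeds because `decoded` requires grad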

soulhi-vz commented 4 years ago

with "loss = mse(decoded, x)" same issue: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

rohitgr7 commented 4 years ago

Can you print decoded.requires_grad after self.forward(x) in training_step? See whether it's True or False.
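
A minimal sketch of the suggested check, applied to the training_step from the code above (the print line is the only addition):

    def training_step(self, train_batch, batch_idx):
        x, _ = train_batch
        decoded, encoded = self.forward(x)
        # debug: the loss can only backpropagate if the model output is in the graph
        print("decoded.requires_grad:", decoded.requires_grad)
        mse = nn.MSELoss()
        loss = mse(decoded, x)
        return {'loss': loss}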

soulhi-vz commented 4 years ago

Hi Rohit, Got: decoded.requires_grad: False

rohitgr7 commented 4 years ago

That means the problem isn't in loss.backward() itself; the graph is already broken earlier. Can you check the same thing right after decoded = self.layer_d_2(x)? Or share a Colab notebook?
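
One standard way to narrow this down (my suggestion, not from the thread): an operation's output requires grad only when grad mode is enabled and at least one of its inputs or parameters requires grad, so both conditions are worth printing inside forward. A sketch of the posted forward with the debug prints added:

    def forward(self, x):
        x = self.layer_e_1(x)
        x = F.relu(x)
        x = self.layer_e_2(x)
        encoded = F.relu(x)
        x = self.layer_d_1(encoded)
        x = F.relu(x)
        decoded = self.layer_d_2(x)
        # an output requires grad only when grad mode is enabled and at least
        # one input or parameter requires grad, so check both conditions here
        print("grad mode enabled:", torch.is_grad_enabled())
        print("layer_d_2.weight.requires_grad:", self.layer_d_2.weight.requires_grad)
        print("decoded.requires_grad:", decoded.requires_grad)
        return decoded, encoded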

soulhi-vz commented 4 years ago

decoded.requires_grad right after layer_d_2: False; decoded.requires_grad in training_step: False

rohitgr7 commented 4 years ago

decoded.requires_grad should be True. Not sure why this is happening. Mind sharing a Colab notebook?

soulhi-vz commented 4 years ago

Hi Rohit, I am not using Colab. Here is the rest of the code:

import numpy as np

def main(hparams) -> None:
    # initialize the DQNLightning
    model = Autoencoder(hparams)
    print("Model")
    print(model)
    print("hparam")
    print(hparams)
    #print("params list")
    #for parameter in model.parameters():
    #    print(parameter)
    trainer = pl.Trainer(
        fast_dev_run=True
        #gpus=10,
        #distributed_backend='dp',
        #max_epochs=500,
        #early_stop_callback=False,
        #val_check_interval=100,
        #show_progress_bar=False
    )

    trainer.fit(model)


torch.manual_seed(0)
np.random.seed(0)

# Pass hyperparameters to the model through the arg parser.
# Create parser: argparse is used to write a friendly command-line interface.
parser = argparse.ArgumentParser()

# Add arguments
parser.add_argument("--batch_size", type=int, default=1024, help="size of the batches")
parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
parser.add_argument("--in_channels", type=int, default=17, help="in channels")
parser.add_argument("--out_channels", type=int, default=100, help="out channels")
parser.add_argument("--kernel_size", type=int, default=3, help="kernel size")

# Parse arguments
args, _ = parser.parse_known_args()
print("Parameters:")
print(args)
main(args)

soulhi-vz commented 4 years ago

Hi Rohit, After restarting the kernel, the issue went away. Thanks for your help. Will close the issue now. /Said
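
A plausible explanation for the kernel restart fixing it (an assumption; it is not confirmed anywhere in the thread): gradient tracking had been globally disabled earlier in the notebook session, for example by a leftover torch.set_grad_enabled(False). In that state every new layer output is created with requires_grad=False, which produces exactly this RuntimeError in loss.backward(). A quick way to check and reset without restarting:

import torch

# if an earlier cell disabled grad mode globally, every new output
# is detached from the graph and backward() raises this RuntimeError
print("grad mode enabled:", torch.is_grad_enabled())

# re-enable gradient tracking for the rest of the session
torch.set_grad_enabled(True)
print("grad mode enabled:", torch.is_grad_enabled())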

JonnyD1117 commented 3 years ago

I have a similar problem, connected to using the pl.metrics.functional package's dice_score. The model was previously training fine with BCELoss, but when I switched to dice_score I get this same error.

Does this mean that the PL version of dice_score is not capable of backprop and can only be used as an evaluation metric, not as a loss function?
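
If the metric applies non-differentiable steps such as argmax or thresholding to the predictions, as Dice implementations intended for evaluation commonly do (I have not verified what pl.metrics.functional.dice_score does internally), the graph is cut and it cannot serve as a loss. A trainable alternative is a soft Dice loss computed directly from probabilities; a minimal sketch for the binary case (the function name, smoothing constant, and shapes are my own choices, not part of the PL API):

import torch

def soft_dice_loss(probs: torch.Tensor, targets: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Differentiable Dice loss for binary segmentation.

    probs:   sigmoid outputs in [0, 1], shape (N, ...)
    targets: binary ground truth with the same shape
    """
    probs = probs.reshape(probs.size(0), -1)
    targets = targets.reshape(targets.size(0), -1)
    intersection = (probs * targets).sum(dim=1)
    denom = probs.sum(dim=1) + targets.sum(dim=1)
    dice = (2 * intersection + eps) / (denom + eps)
    return 1 - dice.mean()

# usage with dummy data (hypothetical shapes):
logits = torch.randn(8, 1, 32, 32, requires_grad=True)
targets = torch.randint(0, 2, (8, 1, 32, 32)).float()
loss = soft_dice_loss(torch.sigmoid(logits), targets)
loss.backward()  # works: no thresholding, so the graph stays intact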