NVlabs / tiny-cuda-nn

Lightning fast C++/CUDA neural network framework

RuntimeError at loss.backward() at the second batch #349

Closed · Miles629 closed this issue 1 year ago

Miles629 commented 1 year ago

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

I ran into the above error when I simply replaced the last MLP with the tcnn-NGP network. I printed the batch_idx before each backward call and found that the error occurs at loss.backward() on the second batch. With the plain MLP, the error does not appear.

batch_idx 0
batch_idx 1
Traceback (most recent call last):
  File "train.py", line 32, in <module>
    main()
  File "train.py", line 25, in main
    trainer.train(epoch=epoch,
  File "core/train/trainers/projectx/trainer.py", line 193, in train
    train_loss.backward()
  File "/root/miniconda3/envs/ngp/lib/python3.8/site-packages/torch/_tensor.py", line 488, in backward
    torch.autograd.backward(
  File "/root/miniconda3/envs/ngp/lib/python3.8/site-packages/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
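For reference, this is the generic PyTorch error raised when part of a computation graph built in an earlier iteration is reused by a later backward call. A minimal sketch that reproduces it, independent of tcnn (names and shapes are illustrative only):

import torch

w = torch.randn(4, 3, requires_grad=True)
x = torch.randn(8, 4)

# Built once, outside the loop: this part of the graph is created a single time.
hidden = torch.relu(x @ w)          # saves tensors needed for its backward pass

for step in range(2):
    loss = (hidden ** 2).mean()     # new ops each step, but they chain onto the old graph
    print("step", step)
    loss.backward()                 # step 1 fails: "Trying to backward through the graph a second time"

The first backward frees the tensors saved for hidden's backward, so the second iteration's loss, which still depends on that same node, cannot be differentiated again.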

My guess is that some tensors in tcnn are still part of the old graph and require gradients when backward is called the second time. However, when I printed the parameters that require gradients, there were none except those of the two networks.

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)
NeRFNGPNet(
  (encoder): NetworkWithInputEncoding(n_input_dims=3, n_output_dims=16, seed=1337, dtype=torch.float16, hyperparams={'encoding': {'base_resolution': 16, 'hash': 'CoherentPrime', 'interpolation': 'Linear', 'log2_hashmap_size': 19, 'n_features_per_level': 2, 'n_levels': 16, 'otype': 'Grid', 'per_level_scale': 1.5, 'type': 'Hash'}, 'network': {'activation': 'ReLU', 'n_hidden_layers': 1, 'n_neurons': 64, 'otype': 'FullyFusedMLP', 'output_activation': 'None'}, 'otype': 'NetworkWithInputEncoding'})
  (color_net): Network(n_input_dims=15, n_output_dims=3, seed=1337, dtype=torch.float16, hyperparams={'encoding': {'offset': 0.0, 'otype': 'Identity', 'scale': 1.0}, 'network': {'activation': 'ReLU', 'n_hidden_layers': 2, 'n_neurons': 64, 'otype': 'FullyFusedMLP', 'output_activation': 'Sigmoid'}, 'otype': 'NetworkWithInputEncoding'})
)
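Tensors that keep an old graph alive do not have to be parameters, so named_parameters() may not reveal them; a cached model output or an input prepared once before the loop can do it too. One way to inspect what actually feeds the loss is to walk its grad_fn chain. A debugging sketch (walk_graph is a hypothetical helper, not part of tcnn or this project):

def walk_graph(fn, depth=0, seen=None):
    # Recursively print the autograd graph hanging off a tensor's grad_fn.
    # Seeing nodes from ops that only run before the training loop is a sign of a reused graph.
    if seen is None:
        seen = set()
    if fn is None or fn in seen:
        return
    seen.add(fn)
    print("  " * depth + type(fn).__name__)
    for next_fn, _ in fn.next_functions:
        walk_graph(next_fn, depth + 1, seen)

# e.g. call walk_graph(train_loss.grad_fn) right before the failing backward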

The following is my code.

import torch
import torch.nn as nn
import tinycudann as tcnn


class NeRFNGPNet(nn.Module):
    def __init__(self):
        super().__init__()

        self.encoder = tcnn.NetworkWithInputEncoding(
            n_input_dims=3,
            n_output_dims=16,
            encoding_config={
                "otype": "HashGrid",
                "n_levels": 16,
                "n_features_per_level": 2,
                "log2_hashmap_size": 19,
                "base_resolution": 16,
                "per_level_scale": 1.5,
            },
            network_config={
                "otype": "FullyFusedMLP",
                "activation": "ReLU",
                "output_activation": "None",
                "n_neurons": 64,
                "n_hidden_layers": 1,
            }
        )

        self.color_net = tcnn.Network(
            n_input_dims=15,
            n_output_dims=3,
            network_config={
                "otype": "FullyFusedMLP",
                "activation": "ReLU",
                "output_activation": "Sigmoid",
                "n_neurons": 64,
                "n_hidden_layers": 2,
            },
        )

        self.register_buffer("center", torch.FloatTensor([0, -0.3, 0]))
        self.register_buffer("scale", torch.FloatTensor([2.5, 2.5, 2.5]))

        self.use_viewdir = False
        self.cond_dim = 0

    def initialize(self, bbox):
        if hasattr(self, "bbox"):
            return
        c = (bbox[0] + bbox[1]) / 2
        s = (bbox[1] - bbox[0])
        self.center = c
        self.scale = s
        self.bbox = bbox

    def forward(self, x, d=None, cond=None):
        # Normalize positions to the [0, 1] cube expected by the hash encoding.
        x = (x - self.center) / self.scale + 0.5
        assert x.min() >= -EPS and x.max() < 1 + EPS  # EPS: small tolerance defined elsewhere in my code
        x = x.clamp(min=0, max=1)
        out = self.encoder(x)
        sigma = out[..., 0]                            # first output channel: density
        color = self.color_net(out[..., 1:]).float()   # remaining 15 channels feed the color MLP
        # sigma = self.sigma_activ(sigma)
        return color, sigma.float()
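One common cause of this error in a module like the one above is storing a tensor that still carries autograd history and then reusing it in every forward pass: for example, if the bbox passed to initialize came out of differentiable ops, self.center and self.scale would drag that old graph into each iteration, and the second backward would fail. A hedged sketch of that kind of fix (only an illustration, since the actual resolution isn't described in this thread) is to detach anything kept across iterations:

    def initialize(self, bbox):
        if hasattr(self, "bbox"):
            return
        bbox = bbox.detach()           # cut any graph history carried in with bbox
        c = (bbox[0] + bbox[1]) / 2
        s = (bbox[1] - bbox[0])
        self.center = c
        self.scale = s
        self.bbox = bbox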
Miles629 commented 1 year ago

solved