NVlabs / instant-ngp

Instant neural graphics primitives: lightning fast NeRF and more
https://nvlabs.github.io/instant-ngp
Other
15.86k stars 1.91k forks source link

Zero samples got into RuntimeError #1536

Closed shyakocat closed 5 months ago

shyakocat commented 5 months ago

When I train on a custom dataset, I got

Traceback (most recent call last):
  File "train.py", line 412, in <module>
    trainer.fit(system, ckpt_path=hparams.ckpt_path)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 608, in fit
    call._call_and_handle_interrupt(
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1112, in _run
    results = self._run_stage()
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1191, in _run_stage
    self._run_train()
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1214, in _run_train
    self.fit_loop.run()
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 213, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 202, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1356, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/core/module.py", line 1742, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 234, in optimizer_step
    return self.precision_plugin.optimizer_step(
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/native_amp.py", line 75, in optimizer_step
    closure_result = closure()
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 149, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 135, in closure
    step_output = self._step_fn()
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 419, in _training_step
    training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1494, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 378, in training_step
    return self.model.training_step(*args, **kwargs)
  File "train.py", line 209, in training_step
    results = self(batch, split='train')
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "train.py", line 123, in forward
    return render(self.model, rays_o, rays_d, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/torch/autocast_mode.py", line 12, in decorate_autocast
    return func(*args, **kwargs)
  File "/root/shy/OmniSeg3D/models/rendering.py", line 35, in render
    results = render_func(model, rays_o, rays_d, hits_t, **kwargs)
  File "/root/shy/OmniSeg3D/models/rendering.py", line 196, in __render_rays_train
    sigmas, rgbs = model(xyzs, dirs, **kwargs)
  File "/root/anaconda3/envs/omniseg3d/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/shy/OmniSeg3D/models/networks.py", line 190, in forward
    sigmas, h = self.density(x, return_feat=True)
  File "/root/shy/OmniSeg3D/models/networks.py", line 150, in density
    inside_aabb = ((x>=self.aabb_min-self.aabb_tol)&
RuntimeError: CUDA error: invalid configuration argument

The error occurs in models/networks.py:

    def density(self, x, return_feat=False):
        """
        Inputs:
            x: (N, 3) xyz in [-scale, scale]
            return_feat: whether to return intermediate feature

        Outputs:
            sigmas: (N)
        """
        x1 = (x-self.xyz_min)/(self.xyz_max-self.xyz_min)
        # h = self.xyz_encoder(x)
        e = self.xyz_encoder(x1)
        h = self.sigma_net(e)
        sigmas = TruncExp.apply(h[:, 0])
 ->   inside_aabb = ((x>=self.aabb_min-self.aabb_tol)&
                       (x<=self.aabb_max+self.aabb_tol)).all(1)
        sigmas = torch.where(inside_aabb, sigmas, torch.tensor(0.0).float().to(sigmas.device))
        if return_feat: return sigmas, h
        # if return_feat: return sigmas, h, e
        return sigmas

because x is an empty tensor, torch.tensor([], size=(0, 3)). And why is x empty? Because this code in models/rendering.py

    (rays_a, xyzs, dirs,
    results['deltas'], results['ts'], results['rm_samples']) = \
        RayMarcher.apply(
            rays_o, rays_d, hits_t[:, 0], model.density_bitfield,
            model.cascades, model.scale,
            exp_step_factor, model.grid_size, MAX_SAMPLES)

returns an empty xyzs (and dirs is empty as well). And why are they empty? Because in models/custom_functions.py

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, rays_o, rays_d, hits_t,
                density_bitfield, cascades, scale, exp_step_factor,
                grid_size, max_samples):
        # noise to perturb the first sample of each ray
        noise = torch.rand_like(rays_o[:, 0])

        rays_a, xyzs, dirs, deltas, ts, counter = \
            vren.raymarching_train(
                rays_o, rays_d, hits_t,
                density_bitfield, cascades, scale,
                exp_step_factor, noise, grid_size, max_samples)

  ->  total_samples = counter[0] # total samples for all rays
        # remove redundant output
        xyzs = xyzs[:total_samples]
        dirs = dirs[:total_samples]
        deltas = deltas[:total_samples]
        ts = ts[:total_samples]

        ctx.save_for_backward(rays_a, ts)

        return rays_a, xyzs, dirs, deltas, ts, total_samples

total_samples is zero, so the slicing produces empty tensors. And why does vren.raymarching_train return a counter[0] of 0? Because in models/csrc/raymarching.cu

    // first pass: compute the number of samples on the ray
    float t = t1; int N_samples = 0;

    // if t1 < 0 (no hit) this loop will be skipped (N_samples will be 0)
    while (0<=t && t<t2 && N_samples<max_samples){
        const float x = ox+t*dx, y = oy+t*dy, z = oz+t*dz;

...

there is no hit, so the sample count is 0.

My dataset contains a single object and the background is empty, so some rays hit nothing.

But why does it run into a RuntimeError? I think NeRF should be able to handle this situation.