nerfstudio-project / gsplat

CUDA accelerated rasterization of gaussian splatting
https://docs.gsplat.studio/
Apache License 2.0

Gsplat >= 1.0 no longer supports older gpus #401

Open pablovela5620 opened 2 months ago

pablovela5620 commented 2 months ago

I have an older 1080 Ti, and I found that after upgrading from gsplat v0.1.12 to >= v1.0.0 I can no longer use the gsplat library. Are there plans to support older GPUs, or is there any way to still allow training? Something similar to tiny-cuda-nn, which falls back to a slower path and prints the following:

tiny-cuda-nn warning: FullyFusedMLP is not supported for the selected architecture 61. Falling back to CutlassMLP. For maximum performance, raise the target GPU architecture to 75+.

This is the error I'm getting with the newer version of gsplat:

[10:08:33] Caching / undistorting eval images                                             full_images_datamanager.py:230
[NOTE] Not running eval iterations since only viewer is enabled.
Use --vis {wandb, tensorboard, viewer+wandb, viewer+tensorboard} to run with eval.
No Nerfstudio checkpoint to load, so training from scratch.
Disabled comet/tensorboard/wandb event writers
[10:08:39] Caching / undistorting train images                                            full_images_datamanager.py:230
Printing profiling stats, from longest to shortest duration in seconds
Trainer.train_iteration: 9.2314              
VanillaPipeline.get_train_loss_dict: 9.2307              

----------------------------------------------------------------------------------------------------------------------------------------------------------------------
ns-train 8 <module>
sys.exit(entrypoint())

train.py 262 entrypoint
main(

train.py 247 main
launch(

train.py 189 launch
main_func(local_rank=0, world_size=world_size, config=config)

train.py 100 train_loop
trainer.train()

trainer.py 266 train
loss, loss_dict, metrics_dict = self.train_iteration(step)

profiler.py 111 inner
out = func(*args, **kwargs)

trainer.py 501 train_iteration
_, loss_dict, metrics_dict = self.pipeline.get_train_loss_dict(step=step)

profiler.py 111 inner
out = func(*args, **kwargs)

base_pipeline.py 300 get_train_loss_dict
model_outputs = self._model(ray_bundle)  # train distributed data parallel model if world_size > 1

module.py 1511 _wrapped_call_impl
return self._call_impl(*args, **kwargs)

module.py 1520 _call_impl
return forward_call(*args, **kwargs)

base_model.py 143 forward
return self.get_outputs(ray_bundle)

splatfacto.py 794 get_outputs
viewmat = get_viewmat(optimized_camera_to_world)

eval_frame.py 489 _fn
return fn(*args, **kwargs)

eval_frame.py 655 catch_errors
return callback(frame, cache_entry, hooks, frame_state)

convert_frame.py 727 _convert_frame
result = inner_convert(frame, cache_entry, hooks, frame_state)

convert_frame.py 383 _convert_frame_assert
compiled_product = _compile(

convert_frame.py 646 _compile
guarded_code = compile_inner(code, one_graph, hooks, transform)

utils.py 244 time_wrapper
r = func(*args, **kwargs)

convert_frame.py 562 compile_inner
out_code = transform_code_object(code, transform)

bytecode_transformation.py 1033 transform_code_object
transformations(instructions, code_options)

convert_frame.py 151 _fn
return fn(*args, **kwargs)

convert_frame.py 527 transform
tracer.run()

symbolic_convert.py 2128 run
super().run()

symbolic_convert.py 818 run
and self.step()

symbolic_convert.py 781 step
getattr(self, inst.opname)(inst)

symbolic_convert.py 2243 RETURN_VALUE
self.output.compile_subgraph(

output_graph.py 919 compile_subgraph
self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)

contextlib.py 79 inner
return func(*args, **kwds)

output_graph.py 1087 compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)

utils.py 244 time_wrapper
r = func(*args, **kwargs)

output_graph.py 1159 call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(

output_graph.py 1140 call_user_compiler
compiled_fn = compiler_fn(gm, self.example_inputs())

after_dynamo.py 117 debug_wrapper
compiled_gm = compiler_fn(gm, example_inputs)

__init__.py 1668 __call__
return compile_fx(model_, inputs_, config_patches=self.config)

compile_fx.py 1168 compile_fx
return aot_autograd(

common.py 55 compiler_fn
cg = aot_module_simplified(gm, example_inputs, **kwargs)

aot_autograd.py 887 aot_module_simplified
compiled_fn = create_aot_dispatcher_function(

utils.py 244 time_wrapper
r = func(*args, **kwargs)

aot_autograd.py 600 create_aot_dispatcher_function
compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)

runtime_wrappers.py 425 aot_wrapper_dedupe
return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)

runtime_wrappers.py 630 aot_wrapper_synthetic_base
return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)

jit_compile_runtime_wrappers.py 97 aot_dispatch_base
compiled_fw = compiler(fw_module, updated_flat_args)

utils.py 244 time_wrapper
r = func(*args, **kwargs)

compile_fx.py 1100 fw_compiler_base
return inner_compile(

after_aot.py 83 debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)

debug.py 305 inner
return fn(*args, **kwargs)

contextlib.py 79 inner
return func(*args, **kwds)

compile_fx.py 320 compile_fx_inner
compiled_graph = fx_codegen_and_compile(

compile_fx.py 550 fx_codegen_and_compile
compiled_fn = graph.compile_to_fn()

graph.py 1116 compile_to_fn
return self.compile_to_module().call

utils.py 244 time_wrapper
r = func(*args, **kwargs)

graph.py 1066 compile_to_module
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()

graph.py 1041 codegen
self.scheduler = Scheduler(self.buffers)

utils.py 244 time_wrapper
r = func(*args, **kwargs)

scheduler.py 1198 __init__
self.nodes = [self.create_scheduler_node(n) for n in nodes]

scheduler.py 1198 <listcomp>
self.nodes = [self.create_scheduler_node(n) for n in nodes]

scheduler.py 1289 create_scheduler_node
group_fn = self.get_backend(node.get_device()).group_fn

scheduler.py 2154 get_backend
self.backends[device] = self.create_backend(device)

scheduler.py 2142 create_backend
raise RuntimeError(

torch._dynamo.exc.BackendCompilerFailed:
backend='inductor' raised:
RuntimeError: Found NVIDIA GeForce GTX 1080 Ti which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability 6.1

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information

You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True

I also want to point out that the original Inria code and rasterizer both still work on my 1080 Ti.

deanziyangyu commented 2 months ago

Seconding this. We have multiple compute capability 6.1 GPUs that could benefit from the multi-GPU training features in gsplat v1.

liruilong940607 commented 2 months ago

Yes, gsplat >= 1.0 requires a GPU with compute capability >= 7.0, due to the use of cg::labeled_partition: https://github.com/nerfstudio-project/gsplat/blob/da0a201b8eafacb127fd8f09c56f2989b453a9ab/gsplat/cuda/csrc/world_to_cam_bwd.cu#L80-L83

A workaround would be welcome!
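
One possible direction (an untested sketch, not something gsplat currently does): the grouping that cg::labeled_partition provides can in principle be emulated on pre-Volta GPUs with __ballot_sync / __shfl_sync, by repeatedly electing a leader lane and matching labels against it. The helper name below is hypothetical:

// Hypothetical emulation of "which lanes share my label" for sm_60/61.
// active: mask of participating lanes; label: e.g. the Gaussian id (gid).
__device__ unsigned match_label(unsigned active, int label) {
    unsigned peers = 0;
    unsigned unclaimed = active;
    while (unclaimed) {
        int leader = __ffs(unclaimed) - 1;                        // lowest unclaimed lane
        int leader_label = __shfl_sync(active, label, leader);    // broadcast its label
        unsigned ballot = __ballot_sync(active, label == leader_label);
        if (label == leader_label) peers = ballot;                // lane mask of my group
        unclaimed &= ~ballot;                                     // those lanes are settled
    }
    return peers;
}
// The lowest lane in `peers` could then gather the group's gradients via
// __shfl_sync and issue a single gpuAtomicAdd, mirroring the sm_70+ path.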

pablovela5620 commented 2 months ago

Unfortunately, I know zero CUDA. I can try to hack my way to an implementation using ChatGPT, but I would need some guidance if someone with more CUDA experience is willing. Is cg::labeled_partition the only thing holding back the use of sub-7.0 GPUs?

So far, this is what I've come up with:

#if __CUDA_ARCH__ >= 700
// Write out results with warp-level reduction
auto warp = cg::tiled_partition<32>(cg::this_thread_block());
auto warp_group_g = cg::labeled_partition(warp, gid);

if (v_means != nullptr) {
    warpSum(v_mean, warp_group_g);
    if (warp_group_g.thread_rank() == 0) {
        v_means += gid * 3;
        GSPLAT_PRAGMA_UNROLL
        for (uint32_t i = 0; i < 3; i++) {
            gpuAtomicAdd(v_means + i, v_mean[i]);
        }
    }
}
// Similar code for v_covars and v_viewmats
#else // __CUDA_ARCH__ < 700: fallback path
// Alternative implementation using shared memory and atomic operations
if (v_means != nullptr) {
    // Allocate shared memory for reduction
    __shared__ vec3<OpT> shared_v_mean[BLOCK_SIZE]; // BLOCK_SIZE is the number of threads per block
    shared_v_mean[threadIdx.x] = v_mean;

    __syncthreads();

    // Perform block-level reduction
    for (unsigned int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            shared_v_mean[threadIdx.x] += shared_v_mean[threadIdx.x + stride];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        v_means += gid * 3;
        GSPLAT_PRAGMA_UNROLL
        for (uint32_t i = 0; i < 3; i++) {
            gpuAtomicAdd(v_means + i, shared_v_mean[0][i]);
        }
    }
}
// Similar code for v_covars and v_viewmats
#endif
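
One caveat with the shared-memory version above: it reduces across the whole block rather than per gid, so it would only be equivalent if every thread in the block handled the same Gaussian. A simpler (untested) alternative for the #else branch would be to skip the aggregation entirely on pre-7.0 GPUs and let each thread atomically add its own contribution, at the cost of more atomic contention:

#else
// Untested alternative: no warp/block aggregation on pre-7.0 GPUs.
// Each thread adds its own gradient directly; correct, but more atomic traffic.
if (v_means != nullptr) {
    v_means += gid * 3;
    GSPLAT_PRAGMA_UNROLL
    for (uint32_t i = 0; i < 3; i++) {
        gpuAtomicAdd(v_means + i, v_mean[i]);
    }
}
// Similar code for v_covars and v_viewmats
#endif
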
liruilong940607 commented 2 months ago

I think labeled_partition is the only thing holding it back, but it will require more changes than this, in the multiple places where it is used.

Feel free to test your ideas locally and see if they pass the tests (pytest tests/). We are happy to accept a PR if anyone comes up with a fix.

pablovela5620 commented 2 months ago

Got it, will see if I can get this working!

ZCB-endeavor commented 1 month ago

Hi, I ran into the same problem. Have you solved it yet?

pablovela5620 commented 1 month ago

Unfortunately not. I did some initial digging but haven't had time to come up with a fix.