Open · hokiyoshi opened this issue 3 months ago
I'm getting an error from the 'mamba_split_conv1d_scan_combined' function:
Traceback (most recent call last):
  File "/home/thientan/desktop/mamba2.py", line 16, in <module>
    y = model(x)
        ^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/mamba_ssm/modules/mamba2.py", line 183, in forward
    out = mamba_split_conv1d_scan_combined(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 930, in mamba_split_conv1d_scan_combined
    return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/torch/autograd/function.py", line 574, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 455, in decorate_fwd
    return fwd(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 795, in forward
    out_x, _, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 313, in _mamba_chunk_scan_combined_fwd
    states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_chunk_state.py", line 746, in _chunk_state_fwd
    _chunk_state_fwd_kernel[grid](
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/runtime/jit.py", line 326, in <lambda>
    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 156, in run
    timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 133, in _bench
    return do_bench(kernel_call, warmup=self.num_warmups, rep=self.num_reps, quantiles=(0.5, 0.2, 0.8))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/testing.py", line 107, in do_bench
    fn()
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 114, in kernel_call
    self.fn.run(
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/runtime/jit.py", line 643, in run
    kernel = self.compile(
             ^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/compiler/compiler.py", line 287, in compile
    next_module = compile_ir(module, metadata)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 330, in <lambda>
    stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, self.capability)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/thientan/anaconda3/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 228, in make_llir
    pm.run(mod)
IndexError: map::at
My setup: GPU: RTX 2080 Ti, Triton 3.0.0, mamba-ssm 2.2.2, CUDA 12.2.
Can anyone help me?
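For context, line 16 of mamba2.py is just a forward pass (y = model(x)). The original script isn't shown, so the sketch below is a hypothetical minimal reproducer with assumed model dimensions, not the reporter's exact code:

    import torch
    from mamba_ssm import Mamba2

    # Hypothetical minimal reproducer; all dimensions here are assumptions.
    # nheads = d_model * expand // headdim = 8 with these values.
    model = Mamba2(d_model=256, d_state=64, d_conv=4, expand=2, headdim=64).to("cuda")
    x = torch.randn(2, 128, 256, device="cuda")  # (batch, seqlen, d_model)

    # With Triton 3.0.0 on an RTX 2080 Ti (sm_75), the forward pass reportedly fails
    # inside mamba_split_conv1d_scan_combined with "IndexError: map::at" while Triton
    # compiles the chunk-state kernel.
    y = model(x)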
I ran into this problem when switching to Triton 3.0.0 (with mamba-ssm installed with the -e flag against Triton 2.1.0); switching back to 2.1.0 fixed it for me.
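If anyone else is triaging the same failure, a small sanity check like the one below can confirm which versions are actually active before and after pinning Triton back (e.g. pip install triton==2.1.0, the version the commenter reports working). The version numbers in the comments come from this thread, not from an official compatibility matrix:

    from importlib.metadata import version
    import torch

    # Print the installed versions relevant to this issue.
    print("triton   :", version("triton"))      # failure seen with 3.0.0; 2.1.0 reported working
    print("mamba-ssm:", version("mamba-ssm"))   # 2.2.2 in the original report
    print("torch    :", torch.__version__, "| CUDA:", torch.version.cuda)  # CUDA 12.2 in the report
    print("GPU      :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device")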