Open lya19971103 opened 3 months ago
Please use triton >= 2.1.0
same issue, triton version==2.1.0, torch=2.0.1, cuda11.6
#
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/mamba_ssm/modules/mamba2.py", line 176, in forward
out = mamba_split_conv1d_scan_combined(
File "/opt/conda/lib/python3.10/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 908, in mamba_split_conv1d_scan_combined
return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 98, in decorate_fwd
return fwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 773, in forward
out, x, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
File "/opt/conda/lib/python3.10/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 307, in _mamba_chunk_scan_combined_fwd
dA_cumsum, dt = _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit)
File "/opt/conda/lib/python3.10/site-packages/mamba_ssm/ops/triton/ssd_chunk_state.py", line 582, in _chunk_cumsum_fwd
_chunk_cumsum_fwd_kernel[grid_chunk_cs](
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 77, in run
timings = {config: self._bench(*args, config=config, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 77, in <dictcomp>
triton 2.1.0 should have cumsum. If not you can try >= 2.2.0
Hi, I have a question: my framework needs torch==2.0.1, but triton>=2.0.0 will be incompatible with it. Is there a way to solve this?
No we require tl.cumsum
@Yyc1999super I'm running torch v2.0.1 with triton v2.3.0 with no issue. Just install triton after torch and you should be fine. Pip will complain but install, afaik this is just a legacy issue of torch.
@Yyc1999super I'm running torch v2.0.1 with triton v2.3.0 with no issue. Just install triton after torch and you should be fine. Pip will complain but install, afaik this is just a legacy issue of torch.
Okay, Thank you. I will try it again.
torch2.0.1, cuda11.6, triton2.3.0 Triton Error [CUDA]: device kernel image is invalid
Hi, I encountered the error "AttributeError: module 'triton.language' has no attribute 'cumsum'". Requesting a solution — thank you very much!
The details are as follows:
File "/root/miniconda3/lib/python3.8/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 307, in _mamba_chunk_scan_combined_fwd dA_cumsum, dt = _chunk_cumsum_fwd(dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit) File "/root/miniconda3/lib/python3.8/site-packages/mamba_ssm/ops/triton/ssd_chunk_state.py", line 582, in _chunk_cumsum_fwd _chunk_cumsum_fwd_kernel[grid_chunk_cs]( File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/autotuner.py", line 77, in run timings = {config: self._bench(*args, config=config, **kwargs) File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/autotuner.py", line 77, in <dictcomp>
timings = {config: self._bench(*args, config=config, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/autotuner.py", line 65, in _bench
return do_bench(kernel_call)
File "/root/miniconda3/lib/python3.8/site-packages/triton/testing.py", line 143, in do_bench
fn()
File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/autotuner.py", line 63, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "<string>", line 41, in _chunk_cumsum_fwd_kernel
File "/root/miniconda3/lib/python3.8/site-packages/triton/compiler.py", line 1589, in compile
fn_cache_manager = CacheManager(make_hash(fn, **kwargs))
File "/root/miniconda3/lib/python3.8/site-packages/triton/compiler.py", line 1499, in make_hash
key = f"{fn.cache_key}-{''.join(signature.values())}-{configs_key}-{constants}-{num_warps}-{num_stages}"
File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/jit.py", line 333, in cache_key
dependencies_finder.visit(self.parse())
File "/root/miniconda3/lib/python3.8/ast.py", line 371, in visit
return visitor(node)
File "/root/miniconda3/lib/python3.8/ast.py", line 379, in generic_visit
self.visit(item)
File "/root/miniconda3/lib/python3.8/ast.py", line 371, in visit
return visitor(node)
File "/root/miniconda3/lib/python3.8/ast.py", line 379, in generic_visit
self.visit(item)
File "/root/miniconda3/lib/python3.8/ast.py", line 371, in visit
return visitor(node)
File "/root/miniconda3/lib/python3.8/ast.py", line 381, in generic_visit
self.visit(value)
File "/root/miniconda3/lib/python3.8/ast.py", line 371, in visit
return visitor(node)
File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/jit.py", line 55, in visit_Call
func = self.visit(node.func)
File "/root/miniconda3/lib/python3.8/ast.py", line 371, in visit
return visitor(node)
File "/root/miniconda3/lib/python3.8/site-packages/triton/runtime/jit.py", line 52, in visit_Attribute
return getattr(lhs, node.attr)
AttributeError: module 'triton.language' has no attribute 'cumsum'