Open Sine7812 opened 6 months ago
I have the same issue. Does anyone have a solution? Help is greatly appreciated.
Has anybody else run into this problem? Did anyone manage to solve it?
Same problem here
Found the solution. Replace this:
```python
def _layer_norm_fwd(
    x, weight, bias, eps, residual=None, out_dtype=None, residual_dtype=None, is_rms_norm=False
):
    if residual is not None:
        residual_dtype = residual.dtype
    M, N = x.shape
    assert x.stride(-1) == 1
    if residual is not None:
        assert residual.stride(-1) == 1
        assert residual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    # allocate output
    y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
    assert y.stride(-1) == 1
    if residual is not None or (residual_dtype is not None and residual_dtype != x.dtype):
        residual_out = torch.empty(M, N, device=x.device, dtype=residual_dtype)
        assert residual_out.stride(-1) == 1
    else:
        residual_out = None
    mean = torch.empty((M,), dtype=torch.float32, device="cuda") if not is_rms_norm else None
    rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # heuristics for number of warps
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[(M,)](
            x,
            y,
            weight,
            bias,
            residual,
            residual_out,
            mean,
            rstd,
            x.stride(0),
            y.stride(0),
            residual.stride(0) if residual is not None else 0,
            residual_out.stride(0) if residual_out is not None else 0,
            N,
            eps,
            is_rms_norm,
            BLOCK_N,
            residual is not None,
            residual_out is not None,
            bias is not None,
        )
    # residual_out is None if residual is None and residual_dtype == input_dtype
    return y, mean, rstd, residual_out if residual_out is not None else x
```
with:
```python
def _layer_norm_fwd(
    x, weight, bias, eps, residual=None, out_dtype=None, residual_dtype=None, is_rms_norm=False
):
    if residual is not None:
        residual_dtype = residual.dtype
    M, N = x.shape
    assert x.stride(-1) == 1
    if residual is not None:
        assert residual.stride(-1) == 1
        assert residual.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    # allocate output
    y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
    assert y.stride(-1) == 1
    if residual is not None or (residual_dtype is not None and residual_dtype != x.dtype):
        residual_out = torch.empty(M, N, device=x.device, dtype=residual_dtype)
        assert residual_out.stride(-1) == 1
    else:
        residual_out = None
    mean = torch.empty((M,), dtype=torch.float32, device="cuda") if not is_rms_norm else None
    rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # heuristics for number of warps
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[(M,)](
            x,
            y,
            weight,
            bias,
            residual,
            residual_out,
            mean,
            rstd,
            x.stride(0),
            y.stride(0),
            residual.stride(0) if residual is not None else 0,
            residual_out.stride(0) if residual_out is not None else 0,
            int(N),
            eps,
            is_rms_norm,
            int(BLOCK_N),
            residual is not None,
            residual_out is not None,
            bias is not None,
        )
    # residual_out is None if residual is None and residual_dtype == input_dtype
    return y, mean, rstd, residual_out if residual_out is not None else x
```
That is, cast both N and BLOCK_N to int. The error shown by the OP only points at BLOCK_N, but if you cast only that one, you hit another error, which is solved by also casting N to int.
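For anyone wondering why the casts are needed: under torch.jit.trace, tensor shape accesses are recorded as 0-dim Tensors rather than plain Python ints, so N (and the BLOCK_N derived from it) are no longer the compile-time constants that Triton expects. A minimal sketch of the effect (my own illustration, not code from the repo):

```python
import torch

def f(x):
    M, N = x.shape
    # Eagerly, N is a plain int; under torch.jit.trace it is a 0-dim Tensor,
    # which is what trips Triton's tl.constexpr check downstream.
    print(type(N))
    return x + int(N)  # int(...) recovers a Python int (with a TracerWarning)

f(torch.randn(2, 3))                      # prints <class 'int'>
torch.jit.trace(f, (torch.randn(2, 3),))  # prints <class 'torch.Tensor'>
```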
One caveat, though: even with these corrections, I still couldn't export to ONNX without corrupting the model. The casts above only let the torch.jit.trace call go through. Strangely, the traced model is not corrupt, but the ONNX one is for me.
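One way to check whether an export is corrupt is to compare the ONNX output against the PyTorch output (a sketch, assuming onnxruntime is installed and that `model`, `dummy_input`, and `"model.onnx"` are what you exported with):

```python
import numpy as np
import onnx
import onnxruntime as ort
import torch

# Structural validity check only; it won't catch numerically wrong exports.
onnx.checker.check_model(onnx.load("model.onnx"))

with torch.no_grad():
    ref = model(dummy_input).cpu().numpy()

sess = ort.InferenceSession("model.onnx")
out = sess.run(None, {sess.get_inputs()[0].name: dummy_input.cpu().numpy()})[0]
print("max abs diff:", np.abs(ref - out).max())  # a large diff means a bad export
```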
@SophieOstmeier @zengjie617789 Have you solved this problem? It has been bothering me for days.
Does anyone know how to view the model's architecture diagram? Whether I use TensorBoard or attempt to convert to ONNX format, I'm encountering errors:

```
Traceback (most recent call last):
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/run_class_finetuning.py", line 744, in <module>
    main(opts, ds_init)
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/run_class_finetuning.py", line 674, in main
    save_model_and_onnx(args, model_without_ddp, epoch, model_ema)
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/run_class_finetuning.py", line 249, in save_model_and_onnx
    torch.onnx.export(model, dummy_input, onnx_path, export_params=True, opset_version=11,
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/onnx/utils.py", line 516, in export
    _export(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/onnx/utils.py", line 1596, in _export
    graph, params_dict, torch_out = _model_to_graph(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/onnx/utils.py", line 1135, in _model_to_graph
    graph, params, torch_out, module = _create_jit_graph(model, args)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/onnx/utils.py", line 1011, in _create_jit_graph
    graph, torch_out = _trace_and_get_graph_from_model(model, args)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/onnx/utils.py", line 915, in _trace_and_get_graph_from_model
    trace_graph, torch_out, inputs_states = torch.jit._get_trace_graph(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/jit/_trace.py", line 1285, in _get_trace_graph
    outs = ONNXTracedModule(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/jit/_trace.py", line 133, in forward
    graph, out = torch._C._create_graph_by_tracing(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/jit/_trace.py", line 124, in wrapper
    outs.append(self.inner(*trace_inputs))
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/models/videomamba.py", line 417, in forward
    x = self.forward_features(x, inference_params)
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/models/videomamba.py", line 390, in forward_features
    hidden_states, residual = layer(
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "/mnt/e/code/VideoMamba-main/videomamba/video_sm/models/videomamba.py", line 123, in forward
    hidden_states, residual = fused_add_norm_fn(
  File "/mnt/e/code/VideoMamba-main/mamba/mamba_ssm/ops/triton/layernorm.py", line 478, in rms_norm_fn
    return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, True)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/mnt/e/code/VideoMamba-main/mamba/mamba_ssm/ops/triton/layernorm.py", line 411, in forward
    y, mean, rstd, residual_out = _layer_norm_fwd(
  File "/mnt/e/code/VideoMamba-main/mamba/mamba_ssm/ops/triton/layernorm.py", line 155, in _layer_norm_fwd
    _layer_norm_fwd_1pass_kernel[(M,)](
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 100, in run
    timings = {config: self._bench(*args, config=config, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 100, in <dictcomp>
    timings = {config: self._bench(*args, config=config, **kwargs)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 83, in _bench
    return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/testing.py", line 104, in do_bench
    fn()
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 81, in kernel_call
    self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
  File "<string>", line 63, in _layer_norm_fwd_1pass_kernel
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/compiler/compiler.py", line 476, in compile
    next_module = compile_kernel(module)
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/compiler/compiler.py", line 381, in <lambda>
    lambda src: optimize_ttir(ast_to_ttir(src, signature, configs[0], constants, debug=debug, arch=arch), arch))
  File "/home/jcz/anaconda3/envs/mamba/lib/python3.10/site-packages/triton/compiler/code_generator.py", line 1133, in ast_to_ttir
    raise CompilationError(fn.src, node, repr(e)) from e
triton.compiler.errors.CompilationError: at 31:24:
    HAS_BIAS: tl.constexpr,
):
    # Map the program id to the row of X and Y it should compute.
ValueError("arange's arguments must be of type tl.constexpr")
```
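That final ValueError is the same constraint the fix above addresses: tl.arange's bounds must be compile-time constants (tl.constexpr), and under tracing BLOCK_N arrives as a Tensor instead of a plain int. A minimal sketch of the constraint (an illustration, not the repo's kernel):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def _copy_kernel(X, Y, N, BLOCK_N: tl.constexpr):
    # tl.arange's bounds must be tl.constexpr: a Tensor-valued BLOCK_N
    # (as produced under torch.jit.trace) fails this check at compile time.
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N  # N, by contrast, can be an ordinary runtime argument
    tl.store(Y + cols, tl.load(X + cols, mask=mask), mask=mask)

x = torch.randn(1000, device="cuda")
y = torch.empty_like(x)
_copy_kernel[(1,)](x, y, x.numel(), BLOCK_N=1024)  # BLOCK_N must be a plain int
```

This is why casting BLOCK_N (and N) back to int before the kernel launch resolves the CompilationError.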