Repro (you may need to remove files cached under /tmp/torchinductor_USERID/ to reproduce):
python benchmarks/dynamo/torchbench.py --training --accuracy --device cuda --inductor --amp --only dlrm
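The same repro as explicit shell steps (a minimal sketch; the cache path follows Inductor's /tmp/torchinductor_<username> convention, so substitute your own username):

# Remove any stale Inductor/Triton compile cache for the current user first
rm -rf "/tmp/torchinductor_$(whoami)"
# Then rerun the failing benchmark
python benchmarks/dynamo/torchbench.py --training --accuracy --device cuda --inductor --amp --only dlrm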
Error:
"""
Traceback (most recent call last):
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/process.py", line 246, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 488, in _worker_compile
    kernel.precompile(warm_cache_only_with_cc=cc)
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 59, in precompile
    self.launchers = [
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 60, in <listcomp>
    self._precompile_config(c, warm_cache_only_with_cc)
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 74, in _precompile_config
    triton.compile(
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/site-packages/triton/compiler.py", line 1256, in compile
    asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/site-packages/triton/compiler.py", line 901, in _compile
    name, asm, shared_mem = _triton.code_gen.compile_ttir(backend, module, device, num_warps, num_stages, extern_libs, cc)
RuntimeError: Internal Triton PTX codegen error:
Segmentation fault (core dumped)
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/common.py", line 1167, in check_accuracy
    new_result = optimized_model_iter_fn(model_copy, example_inputs)
  File "/scratch/binbao/work/pytorch/torch/_dynamo/eval_frame.py", line 211, in _fn
    return fn(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/common.py", line 1050, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 356, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 357, in <graph break in forward_and_backward_pass>
    self.optimizer_zero_grad(mod)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 358, in <graph break in forward_and_backward_pass>
    with self.autocast():
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 361, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/binbao/work/pytorch/torch/_tensor.py", line 484, in backward
    torch.autograd.backward(
  File "/scratch/binbao/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/binbao/work/pytorch/torch/autograd/function.py", line 272, in apply
    return user_fn(self, *args)
  File "/scratch/binbao/work/pytorch/torch/_functorch/aot_autograd.py", line 1683, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/binbao/work/pytorch/torch/_dynamo/optimizations/training.py", line 68, in _wrapped_bw_compiler
    return eval_frame.disable(eval_frame.disable(bw_compiler)(*args, **kwargs))
  File "/scratch/binbao/work/pytorch/torch/_dynamo/eval_frame.py", line 211, in _fn
    return fn(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_dynamo/utils.py", line 90, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/compile_fx.py", line 380, in bw_compiler
    return inner_compile(
  File "/scratch/binbao/work/pytorch/torch/_dynamo/debug_utils.py", line 490, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/debug.py", line 223, in inner
    return fn(*args, **kwargs)
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/binbao/work/pytorch/torch/_inductor/compile_fx.py", line 136, in compile_fx_inner
    compiled_fn = graph.compile_to_fn()
  File "/scratch/binbao/work/pytorch/torch/_inductor/graph.py", line 503, in compile_to_fn
    return self.compile_to_module().call
  File "/scratch/binbao/work/pytorch/torch/_dynamo/utils.py", line 90, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/graph.py", line 492, in compile_to_module
    mod = PyCodeCache.load(code)
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 459, in load
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_binbao/lw/clwubq5pyniwu7wzfxdmxrouhbmyo3g3zaw4iz2vahq32zo47jgo.py", line 658, in <module>
    async_compile.wait(globals())
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 642, in wait
    scope[key] = result.result()
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 512, in result
    self.future.result()
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/_base.py", line 446, in result
    return self.__get_result()
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/_base.py", line 391, in __get_result
    raise self._exception
RuntimeError: Internal Triton PTX codegen error:
Segmentation fault (core dumped)