Repro (you may need to remove files cached under /tmp/torchinductor_USERID/ to reproduce):
python benchmarks/dynamo/torchbench.py --training --accuracy --device cuda --inductor --amp --only dlrm
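The same repro as explicit shell steps (a minimal sketch; the cache path follows Inductor's /tmp/torchinductor_<username> convention, so substitute your own username):

# Remove any stale Inductor/Triton compile cache for the current user first
rm -rf "/tmp/torchinductor_$(whoami)"
# Then rerun the failing benchmark
python benchmarks/dynamo/torchbench.py --training --accuracy --device cuda --inductor --amp --only dlrm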
Error:
"""
Traceback (most recent call last):
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/process.py", line 246, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 488, in _worker_compile
    kernel.precompile(warm_cache_only_with_cc=cc)
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 59, in precompile
    self.launchers = [
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 60, in <listcomp>
    self._precompile_config(c, warm_cache_only_with_cc)
  File "/scratch/binbao/work/pytorch/torch/_inductor/triton_ops/autotune.py", line 74, in _precompile_config
    triton.compile(
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/site-packages/triton/compiler.py", line 1256, in compile
    asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/site-packages/triton/compiler.py", line 901, in _compile
    name, asm, shared_mem = _triton.code_gen.compile_ttir(backend, module, device, num_warps, num_stages, extern_libs, cc)
RuntimeError: Internal Triton PTX codegen error:
Segmentation fault (core dumped)
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/common.py", line 1167, in check_accuracy
    new_result = optimized_model_iter_fn(model_copy, example_inputs)
  File "/scratch/binbao/work/pytorch/torch/_dynamo/eval_frame.py", line 211, in _fn
    return fn(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/common.py", line 1050, in run_n_iterations
    self.model_iter_fn(mod, inputs, collect_outputs=False)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 356, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 357, in <graph break in forward_and_backward_pass>
    self.optimizer_zero_grad(mod)
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 358, in <graph break in forward_and_backward_pass>
    with self.autocast():
  File "/scratch/binbao/work/pytorch/benchmarks/dynamo/torchbench.py", line 361, in <graph break in forward_and_backward_pass>
    self.grad_scaler.scale(loss).backward()
  File "/scratch/binbao/work/pytorch/torch/_tensor.py", line 484, in backward
    torch.autograd.backward(
  File "/scratch/binbao/work/pytorch/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "/scratch/binbao/work/pytorch/torch/autograd/function.py", line 272, in apply
    return user_fn(self, *args)
  File "/scratch/binbao/work/pytorch/torch/_functorch/aot_autograd.py", line 1683, in backward
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
  File "/scratch/binbao/work/pytorch/torch/_dynamo/optimizations/training.py", line 68, in _wrapped_bw_compiler
    return eval_frame.disable(eval_frame.disable(bw_compiler)(*args, **kwargs))
  File "/scratch/binbao/work/pytorch/torch/_dynamo/eval_frame.py", line 211, in _fn
    return fn(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_dynamo/utils.py", line 90, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/compile_fx.py", line 380, in bw_compiler
    return inner_compile(
  File "/scratch/binbao/work/pytorch/torch/_dynamo/debug_utils.py", line 490, in debug_wrapper
    compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/debug.py", line 223, in inner
    return fn(*args, **kwargs)
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/scratch/binbao/work/pytorch/torch/_inductor/compile_fx.py", line 136, in compile_fx_inner
    compiled_fn = graph.compile_to_fn()
  File "/scratch/binbao/work/pytorch/torch/_inductor/graph.py", line 503, in compile_to_fn
    return self.compile_to_module().call
  File "/scratch/binbao/work/pytorch/torch/_dynamo/utils.py", line 90, in time_wrapper
    r = func(*args, **kwargs)
  File "/scratch/binbao/work/pytorch/torch/_inductor/graph.py", line 492, in compile_to_module
    mod = PyCodeCache.load(code)
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 459, in load
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/torchinductor_binbao/lw/clwubq5pyniwu7wzfxdmxrouhbmyo3g3zaw4iz2vahq32zo47jgo.py", line 658, in <module>
    async_compile.wait(globals())
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 642, in wait
    scope[key] = result.result()
  File "/scratch/binbao/work/pytorch/torch/_inductor/codecache.py", line 512, in result
    self.future.result()
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/_base.py", line 446, in result
    return self.__get_result()
  File "/fsx/users/binbao/conda/envs/py3.9/lib/python3.9/concurrent/futures/_base.py", line 391, in __get_result
    raise self._exception
RuntimeError: Internal Triton PTX codegen error:
Segmentation fault (core dumped)