Unit test:
TORCHINDUCTOR_COMPILE_THREADS=1 python test/inductor/test_benchmark_fusion.py --verbose -k "test_avoid_register_spilling_cuda"
Reproducer:
import torch
import functools
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor
# Tiled matrix-multiply kernel with every problem parameter hard-coded.
#
# Arguments:
#   arg_A    -- pointer to the left operand, addressed with strides
#               (stride_am, stride_ak) = (2048, 1).
#   arg_B    -- pointer to the right operand, addressed with strides
#               (stride_bk, stride_bn) = (1, 2048).
#   out_ptr0 -- pointer to the output; element (m, n) is written at offset
#               n + 2048 * m.
#
# Each program instance computes one BLOCK_M x BLOCK_N (64 x 64) output tile,
# walking the K dimension in BLOCK_K (32) sized steps and accumulating in
# ACC_TYPE (float32) with TF32 disabled.
#
# NOTE(review): the trailing "inductor generates a suffix" comment suggests
# this body comes from the PyTorch inductor mm template -- confirm before
# editing statements, since this kernel is a compiler-bug reproducer.
@triton.jit
def triton_mm(arg_A, arg_B, out_ptr0):
    # Compile-time configuration of the kernel.
    GROUP_M : tl.constexpr = 8
    EVEN_K : tl.constexpr = True
    ALLOW_TF32 : tl.constexpr = False
    ACC_TYPE : tl.constexpr = tl.float32
    B_PROLOGUE_CAST_TYPE : tl.constexpr = None
    BLOCK_M : tl.constexpr = 64
    BLOCK_N : tl.constexpr = 64
    BLOCK_K : tl.constexpr = 32
    matrix_instr_nonkdim : tl.constexpr = 0
    A = arg_A
    B = arg_B

    # Fixed problem size.
    M = 2048
    N = 2048
    K = 2048
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = 2048
    stride_ak = 1
    stride_bk = 1
    stride_bn = 2048

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    # Row/column indices of this program's tile; the modulo wraps indices
    # into [0, M) / [0, N) before they are used to address A and B.
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    # Turn the base pointers into per-element pointer blocks for the first
    # K-slice: A is (BLOCK_M, BLOCK_K), B is (BLOCK_K, BLOCK_N).
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    # k counts the remaining extent of K; it is only consulted by the masked
    # loads in the non-EVEN_K branch.
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if B_PROLOGUE_CAST_TYPE is not None:
            b = b.to(B_PROLOGUE_CAST_TYPE)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        # Advance both pointer blocks to the next K-slice.
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    xindex = idx_n + (2048*idx_m)
    tl.store(out_ptr0 + (tl.broadcast_to(xindex, mask.shape)), acc, mask)
# Driver: bind the kernel launch to concrete fp16 tensors and benchmark it.
# Compilation happens on the first call made by do_bench, which is where the
# reported failure is raised.
run_method = triton_mm.run

# Operands are created with torch.randn / torch.zeros and then moved to the
# GPU via .cuda().
tensor1 = torch.randn([2048, 2048], dtype=torch.float16).cuda()
tensor2 = torch.randn([2048, 2048], dtype=torch.float16).cuda()
input_tensors = (tensor1, tensor2)
output_tensor = torch.zeros([2048, 2048], dtype=torch.float16).cuda()

extra_args = ()
warmup_arg = {"warmup": False}

# grid=(1024, 1, 1): one program per 64 x 64 output tile,
# i.e. (2048 / 64) * (2048 / 64) programs.
fn = functools.partial(
    run_method,
    *input_tensors, output_tensor, *extra_args,
    grid=(1024, 1, 1),
    **warmup_arg,
    num_stages=1,
    num_warps=4,
)

triton.testing.do_bench(fn)
Traceback
loc("test.py":55:24): error: failed to legalize operation 'triton_gpu.local_load' that was explicitly marked illegal
Traceback (most recent call last):
File "test.py", line 97, in <module>
triton.testing.do_bench(fn)
File "/root/triton/python/triton/testing.py", line 100, in do_bench
fn()
File "/root/triton/python/triton/runtime/jit.py", line 401, in run
self.cache[device][key] = compile(
File "/root/triton/python/triton/compiler/compiler.py", line 268, in compile
next_module = compile_ir(module, metadata)
File "/root/triton/python/triton/backends/amd/compiler.py", line 223, in <lambda>
stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, 90)
File "/root/triton/python/triton/backends/amd/compiler.py", line 163, in make_llir
pm.run(mod)
RuntimeError: PassManager::run failed
Operating System
-
CPU
-
GPU
AMD Instinct MI250X
ROCm Version
ROCm 6.0.0
ROCm Component
No response
Steps to Reproduce
No response
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
Problem Description
Environment: Docker image:
rocm/pytorch-private:rocm_inductor_triton_upstream_migration_v1
Triton branch: https://github.com/jataylo/triton/tree/jack-triton-inductor-migration
Pytorch branch: https://github.com/pytorch/pytorch/tree/rocm-inductor-hip-device
Unit test
TORCHINDUCTOR_COMPILE_THREADS=1 python test/inductor/test_benchmark_fusion.py --verbose -k "test_avoid_register_spilling_cuda"
Reproducer:
Traceback
Operating System
-
CPU
-
GPU
AMD Instinct MI250X
ROCm Version
ROCm 6.0.0
ROCm Component
No response
Steps to Reproduce
No response
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
No response
Additional Information
No response