Tried local repro. Watcher output:
Dump #6 at 74.819s
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 1,y= 0) phys(x= 2,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 2,y= 0) phys(x= 3,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 3,y= 0) phys(x= 4,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 4,y= 0) phys(x= 5,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 5,y= 0) phys(x= 6,y= 1): NWTW, W, W, W, W rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 6,y= 0) phys(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 7,y= 0) phys(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 8,y= 0) phys(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 9,y= 0) phys(x=10,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x=10,y= 0) phys(x=11,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x=11,y= 0) phys(x=12,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 0,y= 1) phys(x= 1,y= 2): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 1,y= 1) phys(x= 2,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 1) phys(x= 3,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 1) phys(x= 4,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 1) phys(x= 5,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 1) phys(x= 6,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 1) phys(x= 7,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 1) phys(x= 8,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 1) phys(x= 9,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 1) phys(x=10,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 1) phys(x=11,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 1) phys(x=12,y= 2): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 2) phys(x= 1,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 2) phys(x= 2,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 2) phys(x= 3,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 2) phys(x= 4,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 2) phys(x= 5,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 2) phys(x= 6,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 2) phys(x= 7,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 2) phys(x= 8,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 2) phys(x= 9,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 2) phys(x=10,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 2) phys(x=11,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 2) phys(x=12,y= 3): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 3) phys(x= 1,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 3) phys(x= 2,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 3) phys(x= 3,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 3) phys(x= 4,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 3) phys(x= 5,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 3) phys(x= 6,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 3) phys(x= 7,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 3) phys(x= 8,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 3) phys(x= 9,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 3) phys(x=10,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 3) phys(x=11,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 3) phys(x=12,y= 4): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 4) phys(x= 1,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 4) phys(x= 2,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 4) phys(x= 3,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 4) phys(x= 4,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 4) phys(x= 5,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 4) phys(x= 6,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 4) phys(x= 7,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 4) phys(x= 8,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 4) phys(x= 9,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 4) phys(x=10,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 4) phys(x=11,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 4) phys(x=12,y= 5): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 5) phys(x= 1,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 5) phys(x= 2,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 5) phys(x= 3,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 5) phys(x= 4,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 5) phys(x= 5,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 5) phys(x= 6,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 5) phys(x= 7,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 5) phys(x= 8,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 5) phys(x= 9,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 5) phys(x=10,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 5) phys(x=11,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 5) phys(x=12,y= 7): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 6) phys(x= 1,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 6) phys(x= 2,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 6) phys(x= 3,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 6) phys(x= 4,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 6) phys(x= 5,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 6) phys(x= 6,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 6) phys(x= 7,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 6) phys(x= 8,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 6) phys(x= 9,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 6) phys(x=10,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 6) phys(x=11,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 6) phys(x=12,y= 8): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 7) phys(x= 1,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 7) phys(x= 2,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 7) phys(x= 3,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 7) phys(x= 4,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 7) phys(x= 5,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 7) phys(x= 6,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 7) phys(x= 7,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 7) phys(x= 8,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 7) phys(x= 9,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 7) phys(x=10,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 7) phys(x=11,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 7) phys(x=12,y= 9): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 8) phys(x= 1,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 8) phys(x= 2,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 8) phys(x= 3,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 8) phys(x= 4,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 8) phys(x= 5,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 8) phys(x= 6,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 8) phys(x= 7,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 8) phys(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 8) phys(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 8) phys(x=10,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 8) phys(x=11,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 8) phys(x=12,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 9) phys(x= 1,y=11): NTW, PSW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 6,y= 9) phys(x= 7,y=11): NTW, PWW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
k_id[0]: blank
k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
k_id[3]: ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout.cpp
k_id[4]: ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_tile_layout.cpp
k_id[5]: ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
Stack usage summary:
brisc highest stack usage: 192/752, on core (x=7,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_tile_layout.cpp
ncrisc highest stack usage: 244/884, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout.cpp
trisc0 highest stack usage: 188/320, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
trisc1 highest stack usage: 164/256, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
trisc2 highest stack usage: 192/768, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
For the following test, there is a hang after the matmuls. Interestingly, even with a single matmul and nothing afterwards, the test itself passes, but trying to run anything afterwards causes a hang (a minimal sketch of that variant follows the test output below).
import pytest
import torch
import ttnn
from models.utility_functions import tt2torch_tensor, torch2tt_tensor, comp_pcc


def find_max_subblock(out_block_h, out_block_w):
    max_product = 0
    best_h = 1
    best_w = 1
    for h in range(1, out_block_h + 1):
        if out_block_h % h == 0:  # h is a divisor of out_block_h
            for w in range(1, out_block_w + 1):
                if out_block_w % w == 0 and h * w <= 8:  # w is a divisor and the product fits the limit
                    if h * w > max_product:
                        max_product = h * w
                        best_h = h
                        best_w = w
    if out_block_w > best_w:
        best_h = 1
    return best_h, best_w, max_product


@pytest.mark.parametrize(
    "packer_l1_acc",
    [
        True,
    ],
    ids=["pack_l1"],
)
@pytest.mark.parametrize(
    "fp32_acc_mode",
    [
        False,
    ],
    ids=["no_fp32"],
)
@pytest.mark.parametrize(
    "fidelity",
    [
        ttnn.MathFidelity.LoFi,
    ],
    ids=["LoFi"],
)
@pytest.mark.parametrize(
    "has_bias",
    [
        False,
    ],
    ids=["no_bias"],
)
@pytest.mark.parametrize(
    "in1_in_dram, out_sharded, in0_sharded, M, K, N, activation, grid_size",
    [
        (False, True, True, 32, 32, 32, None, (1, 1)),
    ],
)
def test_single_core_matmul(
    device,
    in0_sharded,
    out_sharded,
    in1_in_dram,
    M,
    K,
    N,
    fidelity,
    has_bias,
    activation,
    packer_l1_acc,
    fp32_acc_mode,
    grid_size,
    function_level_defaults,
):
    in0_shape = [1, 1, M, K]
    in1_shape = [1, 1, K, N]
    bias_shape = [1, 1, N]
    num_cores = grid_size[0] * grid_size[1]

    in0_block_h = M // 32
    in0_block_w = K // num_cores // 32
    out_block_h = M // 32
    out_block_w = N // num_cores // 32
    out_subblock_h, out_subblock_w, _ = find_max_subblock(out_block_h, out_block_w)

    interleaved_mem_config = ttnn.MemoryConfig(
        memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
        buffer_type=ttnn.BufferType.DRAM,
    )
    sharded_mem_config = ttnn.MemoryConfig(
        memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
        buffer_type=ttnn.BufferType.L1,
    )

    in0 = torch.randn(in0_shape).bfloat16().float()
    in1 = torch.randn(in1_shape).bfloat16().float()
    bias = torch.randn(bias_shape).bfloat16().float()

    output_mem_config = sharded_mem_config
    in0_t = torch2tt_tensor(in0, device, tt_memory_config=interleaved_mem_config, tt_dtype=ttnn.bfloat16)
    in1_t = torch2tt_tensor(in1, device, tt_memory_config=interleaved_mem_config, tt_dtype=ttnn.bfloat8_b)

    if in0_sharded:
        in0_t = ttnn.interleaved_to_sharded(
            in0_t,
            grid_size,
            [M, int(in0_block_w * 32)],
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            ttnn.ShardOrientation.ROW_MAJOR,
        )

    program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig(
        compute_with_storage_grid_size=grid_size,
        in0_block_w=in0_block_w,
        out_subblock_h=out_subblock_h,
        out_subblock_w=out_subblock_w,
        per_core_M=out_block_h,
        per_core_N=out_block_w,
        fuse_batch=True,
        fused_activation=None,
        mcast_in0=True,
    )
    compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
        math_fidelity=fidelity,
        math_approx_mode=True,
    )
    print(f'{in0_t}\n{in1_t}\n{program_config}\n{output_mem_config}\n{compute_kernel_config}\n out_sharded={out_sharded}')

    output_t = ttnn.matmul(
        in0_t,
        in1_t,
        program_config=program_config,
        memory_config=output_mem_config,
        dtype=ttnn.bfloat8_b,
        compute_kernel_config=compute_kernel_config,
    )
    print(f'FIRST OUTPUT: {output_t}')
    output_t = ttnn.matmul(
        in0_t,
        in1_t,
        program_config=program_config,
        memory_config=output_mem_config,
        dtype=ttnn.bfloat8_b,
        compute_kernel_config=compute_kernel_config,
    )
    print(f'SECOND OUTPUT: {output_t}')

    if out_sharded:
        output_t = ttnn.sharded_to_interleaved(output_t, interleaved_mem_config)

    pt_out = in0 @ in1 + bias if has_bias else in0 @ in1
    tt_out = tt2torch_tensor(output_t)

    passing, output = comp_pcc(pt_out, tt_out)
    print(f'output: {output}')
    assert passing
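As a sanity check on the parametrization, the derived block and subblock sizes can be computed by hand. This standalone sketch (reusing find_max_subblock from the test above) reproduces the values in the MatmulMultiCoreReuseMultiCast1DProgramConfig printed in the output further down:

# Sanity check: derive block/subblock sizes for the M=K=N=32, 1x1-grid case.
# Assumes find_max_subblock from the test above is in scope.
M = K = N = 32
grid_size = (1, 1)
num_cores = grid_size[0] * grid_size[1]  # 1
in0_block_w = K // num_cores // 32       # 1
out_block_h = M // 32                    # 1 -> per_core_M
out_block_w = N // num_cores // 32       # 1 -> per_core_N
out_subblock_h, out_subblock_w, _ = find_max_subblock(out_block_h, out_block_w)
assert (out_subblock_h, out_subblock_w) == (1, 1)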
Watcher output with non-zero k_ids:
Dump #27 at 325.641s
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): NWTW, W, W, W, W rmsg:D0G|BNt smsg:DDDD k_ids:12|11|0
...
Device 0 worker core(x= 0,y= 9) phys(x= 1,y=11): NTW, PSW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 6,y= 9) phys(x= 7,y=11): NTW, PWW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
k_id[0]: blank
k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
k_id[11]: ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_sharded.cpp
k_id[12]: ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/writer_unary_sharded_blocks_interleaved_start_id.cpp
Stack usage summary:
brisc highest stack usage: 188/752, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/writer_unary_sharded.cpp
ncrisc highest stack usage: 224/884, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_sharded_blocks_interleaved_start_id.cpp
trisc0 highest stack usage: 104/320, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
trisc1 highest stack usage: 104/256, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
trisc2 highest stack usage: 192/768, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
Dump #27 completed at 325.651s
Interestingly enough, both matmuls had completed much earlier and produced output:
==================================================== test session starts =====================================================
platform linux -- Python 3.8.10, pytest-7.2.2, pluggy-1.5.0 -- /proj_sw/user_dev/bbradel/tt-metal/python_env/bin/python3
cachedir: .pytest_cache
rootdir: /proj_sw/user_dev/bbradel/tt-metal, configfile: pytest.ini
plugins: anyio-4.4.0, split-0.8.2, timeout-2.2.0, xdist-3.6.1, dash-2.15.0
timeout: 300.0s
timeout method: signal
timeout func_only: False
collected 1 item
m1_test.py::test_single_core_matmul[False-True-True-32-32-32-None-grid_size0-no_bias-LoFi-no_fp32-pack_l1] 2024-09-06 18:00:28.765 | INFO | SiliconDriver - Detected 1 PCI device : [1]
ttnn.Tensor([[[[ 1.03906, -0.92969, ..., 1.03906, -0.55469],
[ 0.61719, -1.59375, ..., -0.03467, 0.33398],
...,
[-0.48047, 1.75000, ..., -0.28906, -0.75000],
[ 0.49219, 1.23438, ..., 0.44336, 0.85156]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT16, layout=Layout::TILE)
ttnn.Tensor([[[[-1.03125, -0.21875, ..., 0.00000, -1.35938],
[-1.57812, 0.62500, ..., -0.18750, -0.59375],
...,
[-1.07812, 0.26562, ..., 0.12500, 0.59375],
[ 0.35938, -0.23438, ..., 0.21875, -0.59375]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
MatmulMultiCoreReuseMultiCast1DProgramConfig(compute_with_storage_grid_size=(x=1,y=1),in0_block_w=1,out_subblock_h=1,out_subblock_w=1,per_core_M=1,per_core_N=1,fuse_batch=1,fused_activation=std::nullopt,mcast_in0=1)
MemoryConfig(memory_layout=TensorMemoryLayout::WIDTH_SHARDED,buffer_type=BufferType::L1,shard_spec=std::nullopt)
<ttnn._ttnn.operations.core.GrayskullComputeKernelConfig object at 0x7fac82dbfcb0>
out_sharded=True
FIRST OUTPUT: ttnn.Tensor([[[[-60.00000, 32.00000, ..., -236.00000, -148.00000],
[-60.00000, 32.00000, ..., -236.00000, -148.00000],
...,
[82.00000, -78.00000, ..., -122.00000, -162.00000],
[82.00000, -78.00000, ..., -122.00000, -162.00000]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
SECOND OUTPUT: ttnn.Tensor([[[[-60.00000, 32.00000, ..., -236.00000, -148.00000],
[-60.00000, 32.00000, ..., -236.00000, -148.00000],
...,
[82.00000, -78.00000, ..., -122.00000, -162.00000],
[82.00000, -78.00000, ..., -122.00000, -162.00000]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
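For reference, here is a minimal sketch of the single-matmul variant mentioned above (one matmul with nothing after it). This is illustrative only: the helper name is hypothetical, and it assumes the tensors and configs are built exactly as in the test above.

# Minimal sketch (hypothetical helper): the single matmul completes and the
# test passes, but any device work issued after this point (a second matmul,
# sharded_to_interleaved, or a readback) reportedly hangs.
import ttnn

def run_single_matmul_variant(in0_t, in1_t, program_config, output_mem_config, compute_kernel_config):
    output_t = ttnn.matmul(
        in0_t,
        in1_t,
        program_config=program_config,
        memory_config=output_mem_config,
        dtype=ttnn.bfloat8_b,
        compute_kernel_config=compute_kernel_config,
    )
    print(f'OUTPUT: {output_t}')
    return output_t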
I reverted the changes in https://github.com/tenstorrent/tt-metal/pull/11520, and the sweeps no longer hang.
Changes:
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
index 5f81e38fdc..d77f83ee9a 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
@@ -189,7 +189,7 @@ void kernel_main() {
                 local_read_addr,
                 in0_multicast_data_addr,
                 in0_block_size_bytes,
-                in0_mcast_num_cores - 1);
+                in0_mcast_num_cores - 1, false, false);
         }
     }
     // Mcast from different CB to another CB
@@ -201,8 +201,8 @@ void kernel_main() {
                 in0_multicast_data_addr,
                 in0_block_size_bytes,
                 in0_mcast_num_cores,
-                true,
-                true);
+                false,
+                false);
         }

         // We should also multicast the flag to destinations
@@ -215,7 +215,7 @@ void kernel_main() {
             noc_semaphore_set_multicast_loopback_src(
                 in0_mcast_sender_semaphore_valid_addr,
                 in0_mcast_receiver_semaphore_noc_addr,
-                in0_mcast_num_cores);
+                in0_mcast_num_cores, false, false);
         }
     } else {
         // If we are not part of receiver grid, always do a regular noc_async_write_multicast to all cores
@@ -225,14 +225,14 @@ void kernel_main() {
             in0_multicast_data_addr,
             in0_block_size_bytes,
             in0_mcast_num_cores,
-            true,
-            true);
+            false,
+            false);

         // We should also multicast the flag to destinations
         noc_semaphore_set_multicast(
             in0_mcast_sender_semaphore_valid_addr,
             in0_mcast_receiver_semaphore_noc_addr,
-            in0_mcast_num_cores);
+            in0_mcast_num_cores, false, false);
     }
     // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc,
     // same cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
index 41d09cc92b..db15989e82 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
@@ -270,7 +270,7 @@ void kernel_main() {
         // num_dests must not include source, since we are NOT really doing a local copy!
         noc_async_write_multicast(
-            in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_cores, true, true);
+            in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_cores, false, false);
         // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same
         // cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).
@@ -280,7 +280,7 @@ void kernel_main() {
         noc_semaphore_set_multicast(
             in1_mcast_receiver_semaphore_addr,
             in1_mcast_receiver_semaphore_noc_addr,
-            in1_mcast_num_cores);
+            in1_mcast_num_cores, false, false);
 #endif

 #ifndef IN1_SHARDED
@@ -351,7 +351,7 @@ void kernel_main() {
         // num_dests must not include source, since we are NOT really doing a local copy!
         noc_async_write_multicast(
-            in3_start_address, in3_multicast_data_addr, in3_block_size_bytes, in1_mcast_num_cores, true, true);
+            in3_start_address, in3_multicast_data_addr, in3_block_size_bytes, in1_mcast_num_cores, false, false);
         // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same
         // cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).
@@ -360,7 +360,7 @@ void kernel_main() {
         noc_semaphore_set_multicast(
             in1_mcast_receiver_semaphore_addr,
             in1_mcast_receiver_semaphore_noc_addr,
-            in1_mcast_num_cores);
+            in1_mcast_num_cores, false, false);
 #endif
         cb_push_back(cb_id_in3, in1_block_w);
Sweep results:
There are many failures in the matmul sweeps for
sweep_name.keyword : "matmul.full.matmul_default_block_sharded" and suite_name.keyword : n_size_32
Before, there were 0 crash/hang failures. Now there are 1088 crash/hang failures, 65 other failures, and 383 passes.