tenstorrent / tt-metal

:metal: TT-NN operator library and TT-Metalium low-level kernel programming model.

Large percentage of failures in matmul.full.matmul_default_block_sharded n_size_32 sweeps #12220

Closed. bbradelTT closed this issue 4 weeks ago.

bbradelTT commented 1 month ago

There are many failures in matmul sweeps for sweep_name.keyword : "matmul.full.matmul_default_block_sharded" and suite_name.keyword : n_size_32

Before, there were 0 crash/hang failures.

Right now there are 1088 crash/hang failures, 65 other failures, and 383 passes.
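
The sweep results appear to be stored behind an Elasticsearch-style index (the .keyword field names suggest this). A minimal sketch of pulling the same breakdown; the host, index name, and the status field are illustrative assumptions, only the two .keyword filters come from the text above:

from elasticsearch import Elasticsearch

# Hypothetical sketch: host, index name, and the "status" field are assumptions.
es = Elasticsearch("http://localhost:9200")
resp = es.search(
    index="sweep-results",
    size=0,
    query={
        "bool": {
            "must": [
                {"term": {"sweep_name.keyword": "matmul.full.matmul_default_block_sharded"}},
                {"term": {"suite_name.keyword": "n_size_32"}},
            ]
        }
    },
    aggs={"by_status": {"terms": {"field": "status.keyword"}}},
)
for bucket in resp["aggregations"]["by_status"]["buckets"]:
    # e.g. crash/hang, other-failure, and pass counts per status value
    print(bucket["key"], bucket["doc_count"])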

bbradelTT commented 1 month ago

Tried local repro. Watcher output:

Dump #6 at 74.819s
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 1,y= 0) phys(x= 2,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 2,y= 0) phys(x= 3,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 3,y= 0) phys(x= 4,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 4,y= 0) phys(x= 5,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 5,y= 0) phys(x= 6,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 6,y= 0) phys(x= 7,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 7,y= 0) phys(x= 8,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 8,y= 0) phys(x= 9,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 9,y= 0) phys(x=10,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x=10,y= 0) phys(x=11,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x=11,y= 0) phys(x=12,y= 1):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 0,y= 1) phys(x= 1,y= 2):   GW,   W,   W,   W,   W  rmsg:D0D|BNT smsg:DDDD k_ids:4|3|5
Device 0 worker core(x= 1,y= 1) phys(x= 2,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 1) phys(x= 3,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 1) phys(x= 4,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 1) phys(x= 5,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 1) phys(x= 6,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 1) phys(x= 7,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 1) phys(x= 8,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 1) phys(x= 9,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 1) phys(x=10,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 1) phys(x=11,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 1) phys(x=12,y= 2):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 2) phys(x= 1,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 2) phys(x= 2,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 2) phys(x= 3,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 2) phys(x= 4,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 2) phys(x= 5,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 2) phys(x= 6,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 2) phys(x= 7,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 2) phys(x= 8,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 2) phys(x= 9,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 2) phys(x=10,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 2) phys(x=11,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 2) phys(x=12,y= 3):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 3) phys(x= 1,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 3) phys(x= 2,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 3) phys(x= 3,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 3) phys(x= 4,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 3) phys(x= 5,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 3) phys(x= 6,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 3) phys(x= 7,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 3) phys(x= 8,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 3) phys(x= 9,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 3) phys(x=10,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 3) phys(x=11,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 3) phys(x=12,y= 4):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 4) phys(x= 1,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 4) phys(x= 2,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 4) phys(x= 3,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 4) phys(x= 4,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 4) phys(x= 5,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 4) phys(x= 6,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 4) phys(x= 7,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 4) phys(x= 8,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 4) phys(x= 9,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 4) phys(x=10,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 4) phys(x=11,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 4) phys(x=12,y= 5):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 5) phys(x= 1,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 5) phys(x= 2,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 5) phys(x= 3,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 5) phys(x= 4,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 5) phys(x= 5,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 5) phys(x= 6,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 5) phys(x= 7,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 5) phys(x= 8,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 5) phys(x= 9,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 5) phys(x=10,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 5) phys(x=11,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 5) phys(x=12,y= 7):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 6) phys(x= 1,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 6) phys(x= 2,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 6) phys(x= 3,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 6) phys(x= 4,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 6) phys(x= 5,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 6) phys(x= 6,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 6) phys(x= 7,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 6) phys(x= 8,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 6) phys(x= 9,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 6) phys(x=10,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 6) phys(x=11,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 6) phys(x=12,y= 8):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 7) phys(x= 1,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 7) phys(x= 2,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 7) phys(x= 3,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 7) phys(x= 4,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 7) phys(x= 5,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 7) phys(x= 6,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 7) phys(x= 7,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 7) phys(x= 8,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 7) phys(x= 9,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 7) phys(x=10,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 7) phys(x=11,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 7) phys(x=12,y= 9):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 8) phys(x= 1,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 1,y= 8) phys(x= 2,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 2,y= 8) phys(x= 3,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 3,y= 8) phys(x= 4,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 4,y= 8) phys(x= 5,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 5,y= 8) phys(x= 6,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 6,y= 8) phys(x= 7,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 8) phys(x= 8,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 8,y= 8) phys(x= 9,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 9,y= 8) phys(x=10,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=10,y= 8) phys(x=11,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x=11,y= 8) phys(x=12,y=10):   GW,   W,   W,   W,   W  rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 9) phys(x= 1,y=11):  NTW, PSW,   W,   W,   W  rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 6,y= 9) phys(x= 7,y=11):  NTW, PWW,   W,   W,   W  rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
k_id[0]: blank
k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
k_id[3]: ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout.cpp
k_id[4]: ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_tile_layout.cpp
k_id[5]: ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
Stack usage summary:
        brisc highest stack usage: 192/752, on core (x=7,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_tile_layout.cpp
        ncrisc highest stack usage: 244/884, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout.cpp
        trisc0 highest stack usage: 188/320, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
        trisc1 highest stack usage: 164/256, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
        trisc2 highest stack usage: 192/768, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp
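
(The watcher is typically enabled by setting the TT_METAL_WATCHER environment variable to a polling interval in seconds; the dump above is one of its periodic snapshots.) A quick throwaway helper to summarize a dump like the one above, counting worker cores by run-message state and kernel ids; the regex is keyed to the exact text layout shown here and is not a general watcher-log parser:

import re
from collections import Counter

def summarize_watcher(dump_text: str) -> Counter:
    # Count worker cores by their (rmsg, k_ids) pair as printed in the dump.
    counts = Counter()
    for line in dump_text.splitlines():
        m = re.search(r"rmsg:(\S+)\s+smsg:\S+\s+k_ids:(\S+)", line)
        if m:
            counts[(m.group(1), m.group(2))] += 1
    return counts

# Fed the dump above, this shows 6 cores still in D0G|BNT and 7 in D0D|BNT on
# k_ids 4|3|5 (the matmul reader/writer/compute kernels), the two dispatch cores
# on k_ids 1 and 2 (cq_prefetch / cq_dispatch), and all remaining worker cores
# idle with k_ids 0|0|0.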
bbradelTT commented 1 month ago

For the following test, there's a hang after the matmuls. Interestingly, even with a single matmul and nothing after it, a case where the test itself passes, trying to run anything afterwards causes a hang.

import pytest
import torch
import ttnn
import time

from models.utility_functions import tt2torch_tensor, torch2tt_tensor, comp_pcc  # comp_pcc is needed for the final comparison

def find_max_subblock(out_block_h, out_block_w):
    max_product = 0
    best_h = 1
    best_w = 1

    for h in range(1, out_block_h + 1):
        if out_block_h % h == 0:  # h is a divisor of out_block_h
            for w in range(1, out_block_w + 1):
                if out_block_w % w == 0 and h * w <= 8:  # w is a divisor and product condition met
                    if h * w > max_product:
                        max_product = h * w
                        best_h = h
                        best_w = w
    if out_block_w > best_w:
        best_h = 1
    return best_h, best_w, max_product

@pytest.mark.parametrize(
    "packer_l1_acc",
    [
        True,
    ],
    ids=["pack_l1"],
)
@pytest.mark.parametrize(
    "fp32_acc_mode",
    [
        False,
    ],
    ids=["no_fp32"],
)
@pytest.mark.parametrize(
    "fidelity",
    [
        ttnn.MathFidelity.LoFi,
    ],
    ids=["LoFi"],
)
@pytest.mark.parametrize(
    "has_bias",
    [
        False,
    ],
    ids=["no_bias"],
)
@pytest.mark.parametrize(
    "in1_in_dram, out_sharded, in0_sharded, M, K, N, activation, grid_size",
    [
        (False, True, True, 32, 32, 32, None, (1, 1)),
    ],
)
def test_single_core_matmul(
    device,
    in0_sharded,
    out_sharded,
    in1_in_dram,
    M,
    K,
    N,
    fidelity,
    has_bias,
    activation,
    packer_l1_acc,
    fp32_acc_mode,
    grid_size,
    function_level_defaults,
):
    in0_shape = [1, 1, M, K]
    in1_shape = [1, 1, K, N]
    bias_shape = [1, 1, N]
    num_cores = grid_size[0] * grid_size[1]

    in0_block_h = M // 32
    in0_block_w = K // num_cores // 32
    out_block_h = M // 32
    out_block_w = N // num_cores // 32

    out_subblock_h, out_subblock_w, _ = find_max_subblock(out_block_h, out_block_w)

    interleaved_mem_config = ttnn.MemoryConfig(
        memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
        buffer_type=ttnn.BufferType.DRAM,
    )
    sharded_mem_config = ttnn.MemoryConfig(
        memory_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
        buffer_type=ttnn.BufferType.L1,
    )

    in0 = torch.randn(in0_shape).bfloat16().float()
    in1 = torch.randn(in1_shape).bfloat16().float()
    bias = torch.randn(bias_shape).bfloat16().float()

    output_mem_config = sharded_mem_config

    in0_t = torch2tt_tensor(in0, device, tt_memory_config=interleaved_mem_config, tt_dtype=ttnn.bfloat16)
    in1_t = torch2tt_tensor(in1, device, tt_memory_config=interleaved_mem_config, tt_dtype=ttnn.bfloat8_b)

    if in0_sharded:
        in0_t = ttnn.interleaved_to_sharded(
            in0_t,
            grid_size,
            [M, int(in0_block_w * 32)],
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            ttnn.ShardOrientation.ROW_MAJOR,
        )

    program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig(
        compute_with_storage_grid_size=grid_size,
        in0_block_w=in0_block_w,
        out_subblock_h=out_subblock_h,
        out_subblock_w=out_subblock_w,
        per_core_M=out_block_h,
        per_core_N=out_block_w,
        fuse_batch=True,
        fused_activation=None,
        mcast_in0=True,
    )

    compute_kernel_config = ttnn.GrayskullComputeKernelConfig(
        math_fidelity=fidelity,
        math_approx_mode=True,
    )

    print(f'{in0_t}\n{in1_t}\n{program_config}\n{output_mem_config}\n{compute_kernel_config}\n out_sharded={out_sharded}')
    output_t = ttnn.matmul(
        in0_t,
        in1_t,
        program_config=program_config,
        memory_config=output_mem_config,
        dtype=ttnn.bfloat8_b,
        compute_kernel_config=compute_kernel_config,
    )
    print(f'FIRST OUTPUT: {output_t}')
    output_t = ttnn.matmul(
        in0_t,
        in1_t,
        program_config=program_config,
        memory_config=output_mem_config,
        dtype=ttnn.bfloat8_b,
        compute_kernel_config=compute_kernel_config,
    )
    print(f'SECOND OUTPUT: {output_t}')
    if out_sharded:
        output_t = ttnn.sharded_to_interleaved(output_t, interleaved_mem_config)
    pt_out = in0 @ in1 + bias

    tt_out = tt2torch_tensor(output_t)

    passing, output = comp_pcc(pt_out, tt_out)
    print(f'output: {output}')
    assert passing
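
For the single-core 32x32x32 parametrization above, the block/subblock arithmetic collapses to all-ones. A standalone arithmetic check (tile units of 32x32), consistent with the program config printed in the log further down:

M, K, N = 32, 32, 32
grid_size = (1, 1)
num_cores = grid_size[0] * grid_size[1]

in0_block_w = K // num_cores // 32   # 1 tile
out_block_h = M // 32                # 1 tile
out_block_w = N // num_cores // 32   # 1 tile

# find_max_subblock(1, 1) can only pick h = w = 1, so out_subblock_h = out_subblock_w = 1
# and per_core_M = per_core_N = 1, matching the printed
# MatmulMultiCoreReuseMultiCast1DProgramConfig(... in0_block_w=1, out_subblock_h=1,
# out_subblock_w=1, per_core_M=1, per_core_N=1 ...).
print(in0_block_w, out_block_h, out_block_w)  # -> 1 1 1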
bbradelTT commented 1 month ago

Watcher output with non-zero k_ids:

Dump #27 at 325.641s
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): NWTW,   W,   W,   W,   W  rmsg:D0G|BNt smsg:DDDD k_ids:12|11|0
...
Device 0 worker core(x= 0,y= 9) phys(x= 1,y=11):  NTW, PSW,   W,   W,   W  rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 6,y= 9) phys(x= 7,y=11):  NTW, PWW,   W,   W,   W  rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
k_id[0]: blank
k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
k_id[11]: ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_sharded.cpp
k_id[12]: ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/writer_unary_sharded_blocks_interleaved_start_id.cpp
Stack usage summary:
        brisc highest stack usage: 188/752, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/writer_unary_sharded.cpp
        ncrisc highest stack usage: 224/884, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_sharded_blocks_interleaved_start_id.cpp
        trisc0 highest stack usage: 104/320, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
        trisc1 highest stack usage: 104/256, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
        trisc2 highest stack usage: 192/768, on core (x=1,y=1), running kernel ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
Dump #27 completed at 325.651s
bbradelTT commented 1 month ago

Interestingly enough, both matmuls had completed much earlier and produced output:

==================================================== test session starts =====================================================
platform linux -- Python 3.8.10, pytest-7.2.2, pluggy-1.5.0 -- /proj_sw/user_dev/bbradel/tt-metal/python_env/bin/python3
cachedir: .pytest_cache
rootdir: /proj_sw/user_dev/bbradel/tt-metal, configfile: pytest.ini
plugins: anyio-4.4.0, split-0.8.2, timeout-2.2.0, xdist-3.6.1, dash-2.15.0
timeout: 300.0s
timeout method: signal
timeout func_only: False
collected 1 item                                                                                                             

m1_test.py::test_single_core_matmul[False-True-True-32-32-32-None-grid_size0-no_bias-LoFi-no_fp32-pack_l1] 2024-09-06 18:00:28.765 | INFO     | SiliconDriver   - Detected 1 PCI device : [1]
ttnn.Tensor([[[[ 1.03906, -0.92969,  ...,  1.03906, -0.55469],
               [ 0.61719, -1.59375,  ..., -0.03467,  0.33398],
               ...,
               [-0.48047,  1.75000,  ..., -0.28906, -0.75000],
               [ 0.49219,  1.23438,  ...,  0.44336,  0.85156]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT16, layout=Layout::TILE)
ttnn.Tensor([[[[-1.03125, -0.21875,  ...,  0.00000, -1.35938],
               [-1.57812,  0.62500,  ..., -0.18750, -0.59375],
               ...,
               [-1.07812,  0.26562,  ...,  0.12500,  0.59375],
               [ 0.35938, -0.23438,  ...,  0.21875, -0.59375]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
MatmulMultiCoreReuseMultiCast1DProgramConfig(compute_with_storage_grid_size=(x=1,y=1),in0_block_w=1,out_subblock_h=1,out_subblock_w=1,per_core_M=1,per_core_N=1,fuse_batch=1,fused_activation=std::nullopt,mcast_in0=1)
MemoryConfig(memory_layout=TensorMemoryLayout::WIDTH_SHARDED,buffer_type=BufferType::L1,shard_spec=std::nullopt)
<ttnn._ttnn.operations.core.GrayskullComputeKernelConfig object at 0x7fac82dbfcb0>
 out_sharded=True
FIRST OUTPUT: ttnn.Tensor([[[[-60.00000, 32.00000,  ..., -236.00000, -148.00000],
               [-60.00000, 32.00000,  ..., -236.00000, -148.00000],
               ...,
               [82.00000, -78.00000,  ..., -122.00000, -162.00000],
               [82.00000, -78.00000,  ..., -122.00000, -162.00000]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
SECOND OUTPUT: ttnn.Tensor([[[[-60.00000, 32.00000,  ..., -236.00000, -148.00000],
               [-60.00000, 32.00000,  ..., -236.00000, -148.00000],
               ...,
               [82.00000, -78.00000,  ..., -122.00000, -162.00000],
               [82.00000, -78.00000,  ..., -122.00000, -162.00000]]]], shape=Shape([1, 1, 32, 32]), dtype=DataType::BFLOAT8_B, layout=Layout::TILE)
bbradelTT commented 1 month ago

I reverted the changes in https://github.com/tenstorrent/tt-metal/pull/11520 and the sweeps no longer hang.

Changes:

diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
index 5f81e38fdc..d77f83ee9a 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp
@@ -189,7 +189,7 @@ void kernel_main() {
                                 local_read_addr,
                                 in0_multicast_data_addr,
                                 in0_block_size_bytes,
-                                in0_mcast_num_cores - 1);
+                                in0_mcast_num_cores - 1, false, false);
                         }
                     }
                     // Mcast from different CB to another CB
@@ -201,8 +201,8 @@ void kernel_main() {
                             in0_multicast_data_addr,
                             in0_block_size_bytes,
                             in0_mcast_num_cores,
-                            true,
-                            true);
+                            false,
+                            false);
                     }

                     // We should also multicast the flag to destinations
@@ -215,7 +215,7 @@ void kernel_main() {
                         noc_semaphore_set_multicast_loopback_src(
                             in0_mcast_sender_semaphore_valid_addr,
                             in0_mcast_receiver_semaphore_noc_addr,
-                            in0_mcast_num_cores);
+                            in0_mcast_num_cores, false, false);
                     }
                 } else {
                     // If we are not part of receiver grid, always do a regular noc_async_write_multicast to all cores
@@ -225,14 +225,14 @@ void kernel_main() {
                         in0_multicast_data_addr,
                         in0_block_size_bytes,
                         in0_mcast_num_cores,
-                        true,
-                        true);
+                        false,
+                        false);

                     // We should also multicast the flag to destinations
                     noc_semaphore_set_multicast(
                         in0_mcast_sender_semaphore_valid_addr,
                         in0_mcast_receiver_semaphore_noc_addr,
-                        in0_mcast_num_cores);
+                        in0_mcast_num_cores, false, false);
                 }
                 // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc,
                 // same cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
index 41d09cc92b..db15989e82 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
@@ -270,7 +270,7 @@ void kernel_main() {

             // num_dests must not include source, since we are NOT really doing a local copy!
             noc_async_write_multicast(
-                in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_cores, true, true);
+                in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_cores, false, false);

             // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same
             // cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).
@@ -280,7 +280,7 @@ void kernel_main() {
             noc_semaphore_set_multicast(
                 in1_mcast_receiver_semaphore_addr,
                 in1_mcast_receiver_semaphore_noc_addr,
-                in1_mcast_num_cores);
+                in1_mcast_num_cores, false, false);
 #endif

 #ifndef IN1_SHARDED
@@ -351,7 +351,7 @@ void kernel_main() {

             // num_dests must not include source, since we are NOT really doing a local copy!
             noc_async_write_multicast(
-                in3_start_address, in3_multicast_data_addr, in3_block_size_bytes, in1_mcast_num_cores, true, true);
+                in3_start_address, in3_multicast_data_addr, in3_block_size_bytes, in1_mcast_num_cores, false, false);
             // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same
             // cmd_buf Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC).

@@ -360,7 +360,7 @@ void kernel_main() {
             noc_semaphore_set_multicast(
                 in1_mcast_receiver_semaphore_addr,
                 in1_mcast_receiver_semaphore_noc_addr,
-                in1_mcast_num_cores);
+                in1_mcast_num_cores, false, false);
 #endif

             cb_push_back(cb_id_in3, in1_block_w);

Sweep results: (screenshot attached; not reproduced here)