tenstorrent / tt-metal

:metal: TT-NN operator library, and TT-Metalium low level kernel programming model.
https://docs.tenstorrent.com/ttnn/latest/index.html
Apache License 2.0
485 stars 79 forks source link

Watcher reads wrong command buffer when using Dynamic NOC setup #15338

Open tt-aho opened 4 days ago

tt-aho commented 4 days ago

In dataflow_api.h, there are now aliases for the specific command buffers used in the original (DEDICATED_NOC) mode and in DYNAMIC_NOC mode:

// Command-buffer aliases (read / write / write-reg / atomic) chosen at compile time.
// Kernel builds for BRISC or NCRISC pick between the core's dedicated-NOC buffer and its
// DYNAMIC_NOC_* counterpart based on NOC_MODE; every other build uses the NCRISC_* values.
#if defined(KERNEL_BUILD)
#if defined(COMPILE_FOR_BRISC)
// BRISC kernel: BRISC_* buffers when NOC_MODE == DM_DEDICATED_NOC, otherwise DYNAMIC_NOC_BRISC_*.
constexpr uint32_t read_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? BRISC_RD_CMD_BUF : DYNAMIC_NOC_BRISC_RD_CMD_BUF;
constexpr uint32_t write_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? BRISC_WR_CMD_BUF : DYNAMIC_NOC_BRISC_WR_CMD_BUF;
constexpr uint32_t write_reg_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? BRISC_WR_REG_CMD_BUF : DYNAMIC_NOC_BRISC_WR_REG_CMD_BUF;
constexpr uint32_t write_at_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? BRISC_AT_CMD_BUF : DYNAMIC_NOC_BRISC_AT_CMD_BUF;
#elif defined(COMPILE_FOR_NCRISC)
// NCRISC kernel: NCRISC_* buffers when NOC_MODE == DM_DEDICATED_NOC, otherwise DYNAMIC_NOC_NCRISC_*.
constexpr uint32_t read_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? NCRISC_RD_CMD_BUF : DYNAMIC_NOC_NCRISC_RD_CMD_BUF;
constexpr uint32_t write_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? NCRISC_WR_CMD_BUF : DYNAMIC_NOC_NCRISC_WR_CMD_BUF;
constexpr uint32_t write_reg_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? NCRISC_WR_REG_CMD_BUF : DYNAMIC_NOC_NCRISC_WR_REG_CMD_BUF;
constexpr uint32_t write_at_cmd_buf = NOC_MODE == DM_DEDICATED_NOC ? NCRISC_AT_CMD_BUF : DYNAMIC_NOC_NCRISC_AT_CMD_BUF;
#else // use the default cmd buffers for compute/eth
// Neither COMPILE_FOR_BRISC nor COMPILE_FOR_NCRISC: NOC_MODE is not consulted here.
constexpr uint32_t read_cmd_buf = NCRISC_RD_CMD_BUF;
constexpr uint32_t write_cmd_buf = NCRISC_WR_CMD_BUF;
constexpr uint32_t write_reg_cmd_buf = NCRISC_WR_REG_CMD_BUF;
constexpr uint32_t write_at_cmd_buf = NCRISC_AT_CMD_BUF;
#endif
#else // FW build
// KERNEL_BUILD not defined: same NCRISC_* values as the compute/eth case above.
constexpr uint32_t read_cmd_buf = NCRISC_RD_CMD_BUF;
constexpr uint32_t write_cmd_buf = NCRISC_WR_CMD_BUF;
constexpr uint32_t write_reg_cmd_buf = NCRISC_WR_REG_CMD_BUF;
constexpr uint32_t write_at_cmd_buf = NCRISC_AT_CMD_BUF;
#endif

However, tt_metal/hw/inc/debug/sanitize_noc.h still uses the hardcoded command buffers, so when DYNAMIC_NOC is in use it reads from the wrong command buffer, e.g. for DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_WITH_ADDR_AND_SIZE_STATE. As a result, it reads 0 for the size when running `./build/test/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul --m 32 --k 2048 --n 128 --num-blocks 8 --cb-num-blocks 10 --cb-padding 256 --num-tests 1 --data-type 1 --num-receivers 2 --num-layers 15`.