LLK functions inline not always work

Describe the bug: When testing GroupNorm, some shapes cause a stack overflow on trisc0. This is probably because `inline` is not always honored by the compiler, which causes the function call depth to blow up.

To Reproduce Steps to reproduce the behavior:

Go to 'yugao/gn_stack_overflow' and run TT_METAL_WATCHER=5 pytest tests/ttnn/unit_tests/operations/test_group_norm.py

To avoid the overflow, apply the following git diff to the LLK third-party code:


diff --git a/common/inc/ckernel.h b/common/inc/ckernel.h
index 37adc62..a30ac47 100644
--- a/common/inc/ckernel.h
+++ b/common/inc/ckernel.h
@@ -356,7 +356,7 @@ inline void cfg_rmw_gpr(uint32_t cfg_addr32, uint32_t cfg_shamt, uint32_t cfg_ma
}

template <uint CfgAddr32, uint Shamt, uint Mask>
-inline void cfg_reg_rmw_tensix(uint32_t val)
+inline __attribute__((always_inline)) void cfg_reg_rmw_tensix(uint32_t val)
{
 uint32_t wrdata = val<<Shamt;
 uint8_t mask_b0 = Mask & 0xff;
diff --git a/common/inc/cmath_common.h b/common/inc/cmath_common.h
index 7470385..60a5ea3 100644
--- a/common/inc/cmath_common.h
+++ b/common/inc/cmath_common.h
@@ -228,12 +228,12 @@ inline constexpr bool is_32bit_input(const std::uint32_t src_format, const std::
        ((output_df == (uint)DataFormat::Int32) || (output_df == (uint)DataFormat::Float32));
}

-inline constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc)
+ALWI constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc)
 {
     return (math_fidelity_desc & 0x7);
 }

-inline constexpr int get_math_fidelity_increment(const int math_fidelity_desc)
+ALWI constexpr int get_math_fidelity_increment(const int math_fidelity_desc)
 {
     return ((math_fidelity_desc >> 3) & 0x1) + 1;
 }
diff --git a/common/inc/cunpack_common.h b/common/inc/cunpack_common.h
index 66a1f5d..b82cbcf 100644
--- a/common/inc/cunpack_common.h
+++ b/common/inc/cunpack_common.h
@@ -361,7 +361,7 @@ namespace ckernel::unpacker
 }

 template <std::uint32_t UNP_SEL = p_setadc::UNP_AB>
-inline void config_unpacker_x_end(const uint32_t face_r_dim)
+ALWI void config_unpacker_x_end(const uint32_t face_r_dim)
 {
     switch (face_r_dim) {
         case 1:
diff --git a/llk_lib/llk_math_eltwise_binary.h b/llk_lib/llk_math_eltwise_binary.h
index 4035f5b..7d1ea02 100644
--- a/llk_lib/llk_math_eltwise_binary.h
+++ b/llk_lib/llk_math_eltwise_binary.h
@@ -211,7 +211,7 @@ inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_in

 template <EltwiseBinaryType eltwise_binary_type, BroadcastType bcast_type, std::uint32_t FIDELITY_INCREMENT>
-inline void eltwise_binary_configure_addrmod() {
+ALWI void eltwise_binary_configure_addrmod() {
     // Use srcA for data movement
     if constexpr (
         (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) {
@@ -258,7 +258,7 @@ template <
     BroadcastType bcast_type,
     int NUM_FIDELITY_PHASES = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) {
+ALWI void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) {
     constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0);
     const uint addr_mod = ADDR_MOD_0;
     constexpr uint innerloop = 16 >> 3; // 8 rows per eltwise op at a time.
@@ -333,7 +333,7 @@ template <
     BroadcastType src_b_bcast_type,
     int MATH_FIDELITY_DESC = 0,
     EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE>
-inline void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {
+ALWI void _llk_math_eltwise_binary_init_(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {

     constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC);
     constexpr int MATH_FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC);
diff --git a/llk_lib/llk_unpack_AB.h b/llk_lib/llk_unpack_AB.h
index 4c8ab17..1c95842 100644
--- a/llk_lib/llk_unpack_AB.h
+++ b/llk_lib/llk_unpack_AB.h
@@ -14,7 +14,7 @@ using namespace ckernel;
 using namespace ckernel::unpacker;

 template
-inline void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) {
+ALWI void _llk_unpack_AB_mop_config_(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) {

 #if SKIP_UNP == 1

     static constexpr uint unpack_srca = TT_OP_NOP;
     static constexpr uint unpack_srcb = TT_OP_NOP;
@@ -90,7 +90,7 @@ inline void _llk_unpack_AB_hw_configure_(const std::uint32_t unpA_src_format, co
 }

 template
-inline void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {
+ALWI void _llk_unpack_AB_init_(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {

cfg_reg_rmw_tensix(transpose); // transpose within the face

Expected behavior: Changing these functions to always-inline makes the test pass.

tenstorrent / tt-metal

LLK functions inline not always work #11265

if SKIP_UNP == 1