Describe the bug
When testing GroupNorm, some shapes cause a stack overflow on trisc0. This is probably because `inline` is not always honored by the compiler, which causes the function call depth to blow up.
To Reproduce
Steps to reproduce the behavior:
Go to 'yugao/gn_stack_overflow' and run TT_METAL_WATCHER=5 pytest tests/ttnn/unit_tests/operations/test_group_norm.py
To avoid the overflow, apply the git diff below to the llk third-party code:
-inline constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc)
+ALWI constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc)
{
return (math_fidelity_desc & 0x7);
}
-inline constexpr int get_math_fidelity_increment(const int math_fidelity_desc)
+ALWI constexpr int get_math_fidelity_increment(const int math_fidelity_desc)
{
return ((math_fidelity_desc >> 3) & 0x1) + 1;
}
diff --git a/common/inc/cunpack_common.h b/common/inc/cunpack_common.h
index 66a1f5d..b82cbcf 100644
--- a/common/inc/cunpack_common.h
+++ b/common/inc/cunpack_common.h
@@ -361,7 +361,7 @@ namespace ckernel::unpacker
}
Describe the bug: When testing GroupNorm, some shapes cause a stack overflow on trisc0. This is probably because `inline` is not always honored by the compiler, which causes the function call depth to blow up.
To Reproduce Steps to reproduce the behavior:
TT_METAL_WATCHER=5 pytest tests/ttnn/unit_tests/operations/test_group_norm.py
To avoid the overflow, apply the git diff below to the llk third-party code:
-inline constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc) +ALWI constexpr int get_math_num_fidelity_phases(const int math_fidelity_desc) { return (math_fidelity_desc & 0x7); }
-inline constexpr int get_math_fidelity_increment(const int math_fidelity_desc) +ALWI constexpr int get_math_fidelity_increment(const int math_fidelity_desc) { return ((math_fidelity_desc >> 3) & 0x1) + 1; } diff --git a/common/inc/cunpack_common.h b/common/inc/cunpack_common.h index 66a1f5d..b82cbcf 100644 --- a/common/inc/cunpack_common.h +++ b/common/inc/cunpack_common.h @@ -361,7 +361,7 @@ namespace ckernel::unpacker }
ALWI void config_unpacker_x_end(const uint32_t face_r_dim) { switch (face_r_dim) { case 1: diff --git a/llk_lib/llk_math_eltwise_binary.h b/llk_lib/llk_math_eltwise_binary.h index 4035f5b..7d1ea02 100644 --- a/llk_lib/llk_math_eltwise_binary.h +++ b/llk_lib/llk_math_eltwise_binary.h @@ -211,7 +211,7 @@ inline void _llk_math_eltwisebinary(const std::uint32_t num_faces, uint dst_in
template <EltwiseBinaryType eltwise_binary_type, BroadcastType bcast_type, std::uint32_t FIDELITY_INCREMENT> -inline void eltwise_binary_configure_addrmod() { +ALWI void eltwise_binary_configure_addrmod() { // Use srcA for data movement if constexpr ( (eltwise_binary_type == ELWADD) || (eltwise_binary_type == ELWSUB) || (eltwise_binary_type == ELWMUL)) { @@ -258,7 +258,7 @@ template < BroadcastType bcast_type, int NUM_FIDELITY_PHASES = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) { +ALWI void eltwise_binary_configure_mop(const std::uint32_t acc_to_dest = 0, const std::uint32_t num_faces = 4) { constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0); const uint addr_mod = ADDR_MOD_0; constexpr uint innerloop = 16 >> 3; // 8 rows per eltwise op at a time. @@ -333,7 +333,7 @@ template < BroadcastType src_b_bcast_type, int MATH_FIDELITY_DESC = 0, EltwiseBinaryReuseDestType binary_reuse_dest = EltwiseBinaryReuseDestType::NONE> -inline void _llk_math_eltwise_binaryinit(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) { +ALWI void _llk_math_eltwise_binaryinit(const std::uint32_t num_faces, const std::uint32_t transpose, const std::uint32_t acc_to_dest) {
constexpr int MATH_FIDELITY_PHASES = get_math_num_fidelity_phases(MATH_FIDELITY_DESC); constexpr int MATH_FIDELITY_INCREMENT = get_math_fidelity_increment(MATH_FIDELITY_DESC); diff --git a/llk_lib/llk_unpack_AB.h b/llk_lib/llk_unpack_AB.h index 4c8ab17..1c95842 100644 --- a/llk_lib/llk_unpack_AB.h +++ b/llk_lib/llk_unpack_AB.h @@ -14,7 +14,7 @@ using namespace ckernel; using namespace ckernel::unpacker;
template
-inline void _llk_unpack_AB_mopconfig(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) {
+ALWI void _llk_unpack_AB_mopconfig(const bool transpose_of_faces=false, const std::uint32_t num_faces=4, const bool narrow_tile=false) {
#if SKIP_UNP == 1
static constexpr uint unpack_srca = TT_OP_NOP; static constexpr uint unpack_srcb = TT_OP_NOP; @@ -90,7 +90,7 @@ inline void _llk_unpack_AB_hwconfigure(const std::uint32_t unpA_src_format, co }
template
-inline void _llk_unpack_ABinit(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {
+ALWI void _llk_unpack_ABinit(const std::uint32_t face_r_dim=FACE_R_DIM, const std::uint32_t num_faces=4, const bool narrow_tile=false, const std::uint32_t transpose=0, const std::uint32_t acc_to_dest=0) {
cfg_reg_rmw_tensix(transpose); // transpose within the face
Expected behavior: Changing these functions from `inline` to always-inline (`ALWI`), as in the diff above, makes the test pass.