Closed naoyam closed 1 week ago
It looks like this was found to be related to the pad operation.
I implemented it to print only out()
and we will get something like:
T1_g_float[ iblockIdx.x18{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13 )) ) ) + ( (nvfuser_index_t)(( i16 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19 )) ) ) + ( (nvfuser_index_t)(( i22 )) ) ) ) ), 128) ), 1) )}, iUS19{1}, ithreadIdx.x17{128} ] ca_pos( 2 ) produce_pos( 3 )
= Set( T2_l_float[ iblockIdx.x24{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13 )) ) ) + ( (nvfuser_index_t)(( i16 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19 )) ) ) + ( (nvfuser_index_t)(( i22 )) ) ) ) ), 128) ), 1) )}, iUS25{1}, ithreadIdx.x23{128} ] ca_pos( 3 ), cache_op=Streaming )
} // %kernel_math
If we want to inline everything, e.g. print the whole set
, we will get something that looks too verbose:
T1_g_float[ iblockIdx.x18{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13
= Set( i11 )) ) ) + ( (nvfuser_index_t)(( i16
= Set( i11 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19
= Set( i11 )) ) ) + ( (nvfuser_index_t)(( i22
= Set( i11 )) ) ) ) ), 128) ), 1) )}, iUS19{1}, ithreadIdx.x17{128} ] ca_pos( 2 ) produce_pos( 3 )
= Set( T2_l_float[ iblockIdx.x24{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13
= Set( i11 )) ) ) + ( (nvfuser_index_t)(( i16
= Set( i11 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19
= Set( i11 )) ) ) + ( (nvfuser_index_t)(( i22
= Set( i11 )) ) ) ) ), 128) ), 1) )}, iUS25{1}, ithreadIdx.x23{128} ] ca_pos( 3 ), cache_op=Streaming )
Does this mean we have:
i13 = Set( i11);
If so, shouldn't we print i11
instead of i13
? When we have, for example, ops like:
i1 = i2 + i3
Inlined printing would be i2 + i3
instead of i1
.
print the input
is also fine. I selected output
because the fusion should have the definition of the original set. For example, the fusion has
i13
= Set( i11, cache_op=Streaming )
The whole fusion:
%kernel_math {
f6 = (float)(7);
f9 = float(2.5) * f6;
i11 = (int64_t)(f9);
i13
= Set( i11, cache_op=Streaming )
i29 = (nvfuser_index_t)(i13);
i16
= Set( i11, cache_op=Streaming )
i31 = (nvfuser_index_t)(i16);
i19
= Set( i11, cache_op=Streaming )
i33 = (nvfuser_index_t)(i19);
i22
= Set( i11, cache_op=Streaming )
i35 = (nvfuser_index_t)(i22);
T2_l_float[ iblockIdx.x24{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13 )) ) ) + ( (nvfuser_index_t)(( i16 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19 )) ) ) + ( (nvfuser_index_t)(( i22 )) ) ) ) ), 128) ), 1) )}, iUS25{1}, ithreadIdx.x23{128} ] ca_pos( 3 )
= pad( T0_g_float[ iS30{( ceilDiv(( ceilDiv(( 1 * ( i1 * i2 ) ), 128) ), 1) )}, iS31{1}, iS29{128} ], {0, 0, i29, i31, i33, i35} )
T1_g_float[ iblockIdx.x18{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13 )) ) ) + ( (nvfuser_index_t)(( i16 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19 )) ) ) + ( (nvfuser_index_t)(( i22 )) ) ) ) ), 128) ), 1) )}, iUS19{1}, ithreadIdx.x17{128} ] ca_pos( 2 ) produce_pos( 3 )
= Set( T2_l_float[ iblockIdx.x24{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( i13 )) ) ) + ( (nvfuser_index_t)(( i16 )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( i19 )) ) ) + ( (nvfuser_index_t)(( i22 )) ) ) ) ), 128) ), 1) )}, iUS25{1}, ithreadIdx.x23{128} ] ca_pos( 3 ), cache_op=Streaming )
} // %kernel_math
i1 = i2 + i3
is not a set, it is a binary add.
For inlined printing, I don't think there's any reason to treat i1 = i2
and i1 = i2 + i3
differently. For a scalar Val, toInlineString()
returns a string that only consists of symbols that have no defining expressions. I think that would make sense for a scalar val defined with set
too.
Sure, we can do that. It gives us an output like
T1_g_float[ iblockIdx.x18{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) ) ), 128) ), 1) )}, iUS19{1}, ithreadIdx.x17{128} ] ca_pos( 2 ) produce_pos( 3 )
= Set( T2_l_float[ iblockIdx.x24{( ceilDiv(( ceilDiv(( 1 * ( ( ( i1 + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) * ( ( i2 + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) + ( (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) )) ) ) ) ), 128) ), 1) )}, iUS25{1}, ithreadIdx.x23{128} ] ca_pos( 3 ), cache_op=Streaming )
i13
is inline printed as (nvfuser_index_t)(( ( (int64_t)(( float(2.5) * ( (float)(7) ) )) ) ))
, its definition is
f6 = (float)(7);
f9 = float(2.5) * f6;
i11 = (int64_t)(f9);
i13
= Set( i11, cache_op=Streaming )
https://github.com/NVIDIA/Fuser/blob/main/csrc/ir/nodes.cpp#L2275-L2277
set
is allowed to have a scalar, so LoadStoreOp
should not assume its input and output are tensors. This command, for example, fails: NVFUSER_DUMP=fusion_ir_math pytest -s -v tests/python/test_python_frontend.py -k test_pad_dynamic