Open zasdfgbnm opened 1 year ago
Generated code:
__global__ void kernel1(Tensor<float, 3> T0, Tensor<float, 2> T1) {
NVFUSER_DEFINE_MAGIC_ZERO
int i108;
i108 = (((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x);
int i50;
i50 = ((((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x)) / T0.size[2];
int i51;
i51 = ((((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x)) % T0.size[2];
float T2[1];
if ((i108 < (T0.size[0] * T0.size[2]))) {
T2[0]
= T0[(i50 * T0.stride[0]) + i51];
} else {
if ((i108 < (T0.size[0] * T0.size[2]))) {
T2[0]
= T0[(i50 * T0.stride[0]) + i51];
}
}
float T4[1];
T4[0] = 0.00000000000000000e+00;
#pragma unroll 1
for(nvfuser_index_t i38 = 0; i38 < (ceilDiv((ceilDiv(i1, 4)), 1)); ++i38) {
if (((i108 < (T0.size[0] * T0.size[2])) && (((i38 * 4) + 3) < i1))) {
#pragma unroll
for(nvfuser_index_t i40 = 0; i40 < 4; ++i40) {
T4[0]
= T4[0]
+ T2[0];
}
NVFUSER_UPDATE_MAGIC_ZERO
} else {
#pragma unroll
for(nvfuser_index_t i40 = 0; i40 < 4; ++i40) {
if (((i108 < (T0.size[0] * T0.size[2])) && (((i38 * 4) + (i40 + nvfuser_zero)) < i1))) {
T4[0]
= T4[0]
+ T2[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO
}
}
float T3[1];
T3[0] = 0.00000000000000000e+00;
T3[0]
= T3[0]
+ T4[0];
if ((i108 < (T0.size[0] * T0.size[2]))) {
T1[i108]
= T3[0];
}
}
🐛 Describe the bug
error
Versions
devel