Open johanneslenfers opened 2 years ago
"execute convolution" actually fails for me.
result.runtime.isRight was false
Ah, mm fails silently in the test. One problem is that loop unrolling needs to insert an if-guard, as the loop may be taken once, or never, depending on runtime values. I've tried to enable generating an if-guard in https://github.com/rise-lang/shine/tree/issue222, codegen now works but not execution.
Generated kernel, not sure what's going wrong there, the code is pretty massive:
"__kernel __attribute__ ((reqd_work_group_size(16, 16, 1)))"
"void k0(global float* restrict output, int n205, int n204, int n206, const global float* restrict e207, const global float* restrict e208, local struct Record_32_4_float_32_256_float* restrict x1550){"
" /* Start of moved local vars */"
" /* End of moved local vars */"
" /* mapWorkGroup */"
" for (int wg_id_1614 = get_group_id(1); wg_id_1614 < (n204 / 4); wg_id_1614 = 16 + wg_id_1614) {"
" /* mapWorkGroup */"
" for (int wg_id_1615 = get_group_id(0); wg_id_1615 < (n205 / 256); wg_id_1615 = 64 + wg_id_1615) {"
" /* oclReduceSeq */"
" {"
" float x1401[256];"
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[4 * get_local_id(0)] = 0.0f;"
" x1401[1 + (4 * get_local_id(0))] = 0.0f;"
" x1401[2 + (4 * get_local_id(0))] = 0.0f;"
" x1401[3 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[64 + (4 * get_local_id(0))] = 0.0f;"
" x1401[65 + (4 * get_local_id(0))] = 0.0f;"
" x1401[66 + (4 * get_local_id(0))] = 0.0f;"
" x1401[67 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[128 + (4 * get_local_id(0))] = 0.0f;"
" x1401[129 + (4 * get_local_id(0))] = 0.0f;"
" x1401[130 + (4 * get_local_id(0))] = 0.0f;"
" x1401[131 + (4 * get_local_id(0))] = 0.0f;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[192 + (4 * get_local_id(0))] = 0.0f;"
" x1401[193 + (4 * get_local_id(0))] = 0.0f;"
" x1401[194 + (4 * get_local_id(0))] = 0.0f;"
" x1401[195 + (4 * get_local_id(0))] = 0.0f;"
" }"
" else {"
" /* skip */"
" }"
" "
" for (int i_1616 = 0; i_1616 < (n206 / 32); i_1616 = 1 + i_1616) {"
" /* mapLocal */"
" for (int l_id_1617 = get_local_id(1); l_id_1617 < 2; l_id_1617 = 16 + l_id_1617) {"
" /* mapLocal */"
" /* iteration count is exactly 1, no loop emitted */"
" int l_id_1618 = get_local_id(0);"
" vstore4(vload4(0, &e207[(((4 * wg_id_1614) + ((16 * l_id_1617) * n204)) + ((32 * i_1616) * n204)) + (l_id_1618 * n204)]), 0, &x1550[0]._fst[(4 * l_id_1618) + (64 * l_id_1617)]);"
" }"
" "
" /* mapLocal */"
" for (int l_id_1619 = get_local_id(1); l_id_1619 < 32; l_id_1619 = 16 + l_id_1619) {"
" /* mapLocal */"
" for (int l_id_1620 = get_local_id(0); l_id_1620 < 64; l_id_1620 = 16 + l_id_1620) {"
" vstore4(vload4(0, &e208[(((4 * l_id_1620) + ((32 * i_1616) * n205)) + (256 * wg_id_1615)) + (l_id_1619 * n205)]), 0, &x1550[0]._snd[(4 * l_id_1620) + (256 * l_id_1619)]);"
" }"
" "
" }"
" "
" barrier(CLK_LOCAL_MEM_FENCE);"
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[4 * get_local_id(0)];"
" x1454[1] = x1401[1 + (4 * get_local_id(0))];"
" x1454[2] = x1401[2 + (4 * get_local_id(0))];"
" x1454[3] = x1401[3 + (4 * get_local_id(0))];"
" for (int i_1621 = 0; i_1621 < 32; i_1621 = 1 + i_1621) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1621) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(4 * get_local_id(0)) + (256 * i_1621)];"
" x1504._snd[1] = x1550[0]._snd[(1 + (4 * get_local_id(0))) + (256 * i_1621)];"
" x1504._snd[2] = x1550[0]._snd[(2 + (4 * get_local_id(0))) + (256 * i_1621)];"
" x1504._snd[3] = x1550[0]._snd[(3 + (4 * get_local_id(0))) + (256 * i_1621)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[4 * get_local_id(0)] = x1454[0];"
" x1401[1 + (4 * get_local_id(0))] = x1454[1];"
" x1401[2 + (4 * get_local_id(0))] = x1454[2];"
" x1401[3 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[64 + (4 * get_local_id(0))];"
" x1454[1] = x1401[65 + (4 * get_local_id(0))];"
" x1454[2] = x1401[66 + (4 * get_local_id(0))];"
" x1454[3] = x1401[67 + (4 * get_local_id(0))];"
" for (int i_1622 = 0; i_1622 < 32; i_1622 = 1 + i_1622) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1622) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(64 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[1] = x1550[0]._snd[(65 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[2] = x1550[0]._snd[(66 + (4 * get_local_id(0))) + (256 * i_1622)];"
" x1504._snd[3] = x1550[0]._snd[(67 + (4 * get_local_id(0))) + (256 * i_1622)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[64 + (4 * get_local_id(0))] = x1454[0];"
" x1401[65 + (4 * get_local_id(0))] = x1454[1];"
" x1401[66 + (4 * get_local_id(0))] = x1454[2];"
" x1401[67 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[128 + (4 * get_local_id(0))];"
" x1454[1] = x1401[129 + (4 * get_local_id(0))];"
" x1454[2] = x1401[130 + (4 * get_local_id(0))];"
" x1454[3] = x1401[131 + (4 * get_local_id(0))];"
" for (int i_1623 = 0; i_1623 < 32; i_1623 = 1 + i_1623) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1623) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(128 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[1] = x1550[0]._snd[(129 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[2] = x1550[0]._snd[(130 + (4 * get_local_id(0))) + (256 * i_1623)];"
" x1504._snd[3] = x1550[0]._snd[(131 + (4 * get_local_id(0))) + (256 * i_1623)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[128 + (4 * get_local_id(0))] = x1454[0];"
" x1401[129 + (4 * get_local_id(0))] = x1454[1];"
" x1401[130 + (4 * get_local_id(0))] = x1454[2];"
" x1401[131 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" /* oclReduceSeq */"
" {"
" float x1454[4];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1401[192 + (4 * get_local_id(0))];"
" x1454[1] = x1401[193 + (4 * get_local_id(0))];"
" x1454[2] = x1401[194 + (4 * get_local_id(0))];"
" x1454[3] = x1401[195 + (4 * get_local_id(0))];"
" for (int i_1624 = 0; i_1624 < 32; i_1624 = 1 + i_1624) {"
" {"
" struct Record_1_float_4_float x1504;"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" x1504._fst[0] = x1550[0]._fst[(4 * i_1624) + get_local_id(1)];"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1504._snd[0] = x1550[0]._snd[(192 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[1] = x1550[0]._snd[(193 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[2] = x1550[0]._snd[(194 + (4 * get_local_id(0))) + (256 * i_1624)];"
" x1504._snd[3] = x1550[0]._snd[(195 + (4 * get_local_id(0))) + (256 * i_1624)];"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
" x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
" x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
" x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
" }"
" "
" }"
" "
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 4 */"
" x1401[192 + (4 * get_local_id(0))] = x1454[0];"
" x1401[193 + (4 * get_local_id(0))] = x1454[1];"
" x1401[194 + (4 * get_local_id(0))] = x1454[2];"
" x1401[195 + (4 * get_local_id(0))] = x1454[3];"
" }"
" "
" }"
" else {"
" /* skip */"
" }"
" "
" barrier(CLK_LOCAL_MEM_FENCE);"
" }"
" "
" /* mapLocal */"
" /* unrolling loop of 1 */"
" if (get_local_id(1) < 4) {"
" /* mapLocal */"
" /* unrolling loop of 4 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[4 * get_local_id(0)], x1401[1 + (4 * get_local_id(0))], x1401[2 + (4 * get_local_id(0))], x1401[3 + (4 * get_local_id(0))]), 0, &output[((((4 * n205) * wg_id_1614) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[64 + (4 * get_local_id(0))], x1401[65 + (4 * get_local_id(0))], x1401[66 + (4 * get_local_id(0))], x1401[67 + (4 * get_local_id(0))]), 0, &output[(((64 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[128 + (4 * get_local_id(0))], x1401[129 + (4 * get_local_id(0))], x1401[130 + (4 * get_local_id(0))], x1401[131 + (4 * get_local_id(0))]), 0, &output[(((128 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" /* mapSeq */"
" /* unrolling loop of 1 */"
" vstore4((float4)(x1401[192 + (4 * get_local_id(0))], x1401[193 + (4 * get_local_id(0))], x1401[194 + (4 * get_local_id(0))], x1401[195 + (4 * get_local_id(0))]), 0, &output[(((192 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
" }"
" else {"
" /* skip */"
" }"
" "
" }"
" "
" }"
" "
" }"
" "
"}"
"";
The mm is very heavy for non-gpu devices, so it's allowed to have an EXECUTION_ERROR
. Btw. the assert
to check that is missing here. If the codegen works it should be fine.
The mm is very heavy for non-gpu devices, so it's allowed to have an
EXECUTION_ERROR
. Btw. theassert
to check that is missing here. If the codegen works it should be fine.
The execution succeeds on a RTX 2070.
Problem
Use of private memory fails for some valid parameter configurations for the mm nvidia version if private opencl address space is used. Used to work in older versions. The performance is slightly worse if local address space is used.
Reproduce
Change lines in expression. Then, execute this test. The first parameter configuration will fail if the private version is used.
Output and Performance (RTX 2070)
Local
Private
Todo
Find out reason for
assertion failed
- error.