rise-lang / shine

The Shine compiler for the RISE language
https://rise-lang.org
MIT License
73 stars 8 forks source link

Use of opencl local address space instead of private in nvidia mm version #222

Open johanneslenfers opened 2 years ago

johanneslenfers commented 2 years ago

Problem

Use of private memory fails for some valid parameter configurations for the mm nvidia version if private opencl address space is used. Used to work in older versions. The performance is slightly worse if local address space is used.

                // FIXME: there seems to be a bug in AdjustArraySizesForAllocations
//                oclReduceSeq(AddressSpace.Private)(fun((p13, p14) =>
                  oclReduceSeq(AddressSpace.Local)(fun((p13, p14) =>

Reproduce

Change lines in expression. Then, execute this test. The first parameter configuration will fail if the private version is used.

Output and Performance (RTX 2070)

Local

 1. 6.081472 ms
 2. 5.368928 ms

Private

1. CODE_GENERATION_ERROR - assertion failed
2. 4.872192 ms

Todo

Find out reason for assertion failed - error.

Bastacyclop commented 2 years ago

"execute convolution" actually fails for me. image result.runtime.isRight was false

Bastacyclop commented 2 years ago

Ah, mm fails silently in the test. One problem is that loop unrolling needs to insert an if-guard, as the loop may be taken once, or never, depending on runtime values. I've tried to enable generating an if-guard in https://github.com/rise-lang/shine/tree/issue222, codegen now works but not execution.

Generated kernel, not sure what's going wrong there, the code is pretty massive:

"__kernel __attribute__ ((reqd_work_group_size(16, 16, 1)))"
"void k0(global float* restrict output, int n205, int n204, int n206, const global float* restrict e207, const global float* restrict e208, local struct Record_32_4_float_32_256_float* restrict x1550){"
"  /* Start of moved local vars */"
"  /* End of moved local vars */"
"  /* mapWorkGroup */"
"  for (int wg_id_1614 = get_group_id(1); wg_id_1614 < (n204 / 4); wg_id_1614 = 16 + wg_id_1614) {"
"    /* mapWorkGroup */"
"    for (int wg_id_1615 = get_group_id(0); wg_id_1615 < (n205 / 256); wg_id_1615 = 64 + wg_id_1615) {"
"      /* oclReduceSeq */"
"      {"
"        float x1401[256];"
"        /* mapLocal */"
"        /* unrolling loop of 1 */"
"        if (get_local_id(1) < 4) {"
"          /* mapLocal */"
"          /* unrolling loop of 4 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[4 * get_local_id(0)] = 0.0f;"
"          x1401[1 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[2 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[3 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[64 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[65 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[66 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[67 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[128 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[129 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[130 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[131 + (4 * get_local_id(0))] = 0.0f;"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 4 */"
"          x1401[192 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[193 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[194 + (4 * get_local_id(0))] = 0.0f;"
"          x1401[195 + (4 * get_local_id(0))] = 0.0f;"
"        }"
"         else {"
"          /* skip */"
"        }"
"        "
"        for (int i_1616 = 0; i_1616 < (n206 / 32); i_1616 = 1 + i_1616) {"
"          /* mapLocal */"
"          for (int l_id_1617 = get_local_id(1); l_id_1617 < 2; l_id_1617 = 16 + l_id_1617) {"
"            /* mapLocal */"
"            /* iteration count is exactly 1, no loop emitted */"
"            int l_id_1618 = get_local_id(0);"
"            vstore4(vload4(0, &e207[(((4 * wg_id_1614) + ((16 * l_id_1617) * n204)) + ((32 * i_1616) * n204)) + (l_id_1618 * n204)]), 0, &x1550[0]._fst[(4 * l_id_1618) + (64 * l_id_1617)]);"
"          }"
"          "
"          /* mapLocal */"
"          for (int l_id_1619 = get_local_id(1); l_id_1619 < 32; l_id_1619 = 16 + l_id_1619) {"
"            /* mapLocal */"
"            for (int l_id_1620 = get_local_id(0); l_id_1620 < 64; l_id_1620 = 16 + l_id_1620) {"
"              vstore4(vload4(0, &e208[(((4 * l_id_1620) + ((32 * i_1616) * n205)) + (256 * wg_id_1615)) + (l_id_1619 * n205)]), 0, &x1550[0]._snd[(4 * l_id_1620) + (256 * l_id_1619)]);"
"            }"
"            "
"          }"
"          "
"          barrier(CLK_LOCAL_MEM_FENCE);"
"          /* mapLocal */"
"          /* unrolling loop of 1 */"
"          if (get_local_id(1) < 4) {"
"            /* mapLocal */"
"            /* unrolling loop of 4 */"
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[4 * get_local_id(0)];"
"              x1454[1] = x1401[1 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[2 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[3 + (4 * get_local_id(0))];"
"              for (int i_1621 = 0; i_1621 < 32; i_1621 = 1 + i_1621) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1621) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(4 * get_local_id(0)) + (256 * i_1621)];"
"                  x1504._snd[1] = x1550[0]._snd[(1 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  x1504._snd[2] = x1550[0]._snd[(2 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  x1504._snd[3] = x1550[0]._snd[(3 + (4 * get_local_id(0))) + (256 * i_1621)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[4 * get_local_id(0)] = x1454[0];"
"              x1401[1 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[2 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[3 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[64 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[65 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[66 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[67 + (4 * get_local_id(0))];"
"              for (int i_1622 = 0; i_1622 < 32; i_1622 = 1 + i_1622) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1622) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(64 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[1] = x1550[0]._snd[(65 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[2] = x1550[0]._snd[(66 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  x1504._snd[3] = x1550[0]._snd[(67 + (4 * get_local_id(0))) + (256 * i_1622)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[64 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[65 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[66 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[67 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[128 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[129 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[130 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[131 + (4 * get_local_id(0))];"
"              for (int i_1623 = 0; i_1623 < 32; i_1623 = 1 + i_1623) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1623) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(128 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[1] = x1550[0]._snd[(129 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[2] = x1550[0]._snd[(130 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  x1504._snd[3] = x1550[0]._snd[(131 + (4 * get_local_id(0))) + (256 * i_1623)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[128 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[129 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[130 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[131 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"            /* oclReduceSeq */"
"            {"
"              float x1454[4];"
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1454[0] = x1401[192 + (4 * get_local_id(0))];"
"              x1454[1] = x1401[193 + (4 * get_local_id(0))];"
"              x1454[2] = x1401[194 + (4 * get_local_id(0))];"
"              x1454[3] = x1401[195 + (4 * get_local_id(0))];"
"              for (int i_1624 = 0; i_1624 < 32; i_1624 = 1 + i_1624) {"
"                {"
"                  struct Record_1_float_4_float x1504;"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  x1504._fst[0] = x1550[0]._fst[(4 * i_1624) + get_local_id(1)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1504._snd[0] = x1550[0]._snd[(192 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[1] = x1550[0]._snd[(193 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[2] = x1550[0]._snd[(194 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  x1504._snd[3] = x1550[0]._snd[(195 + (4 * get_local_id(0))) + (256 * i_1624)];"
"                  /* mapSeq */"
"                  /* unrolling loop of 1 */"
"                  /* mapSeq */"
"                  /* unrolling loop of 4 */"
"                  x1454[0] = x1454[0] + (x1504._fst[0] * x1504._snd[0]);"
"                  x1454[1] = x1454[1] + (x1504._fst[0] * x1504._snd[1]);"
"                  x1454[2] = x1454[2] + (x1504._fst[0] * x1504._snd[2]);"
"                  x1454[3] = x1454[3] + (x1504._fst[0] * x1504._snd[3]);"
"                }"
"                "
"              }"
"              "
"              /* mapSeq */"
"              /* unrolling loop of 1 */"
"              /* mapSeq */"
"              /* unrolling loop of 4 */"
"              x1401[192 + (4 * get_local_id(0))] = x1454[0];"
"              x1401[193 + (4 * get_local_id(0))] = x1454[1];"
"              x1401[194 + (4 * get_local_id(0))] = x1454[2];"
"              x1401[195 + (4 * get_local_id(0))] = x1454[3];"
"            }"
"            "
"          }"
"           else {"
"            /* skip */"
"          }"
"          "
"          barrier(CLK_LOCAL_MEM_FENCE);"
"        }"
"        "
"        /* mapLocal */"
"        /* unrolling loop of 1 */"
"        if (get_local_id(1) < 4) {"
"          /* mapLocal */"
"          /* unrolling loop of 4 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[4 * get_local_id(0)], x1401[1 + (4 * get_local_id(0))], x1401[2 + (4 * get_local_id(0))], x1401[3 + (4 * get_local_id(0))]), 0, &output[((((4 * n205) * wg_id_1614) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[64 + (4 * get_local_id(0))], x1401[65 + (4 * get_local_id(0))], x1401[66 + (4 * get_local_id(0))], x1401[67 + (4 * get_local_id(0))]), 0, &output[(((64 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[128 + (4 * get_local_id(0))], x1401[129 + (4 * get_local_id(0))], x1401[130 + (4 * get_local_id(0))], x1401[131 + (4 * get_local_id(0))]), 0, &output[(((128 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          /* mapSeq */"
"          /* unrolling loop of 1 */"
"          vstore4((float4)(x1401[192 + (4 * get_local_id(0))], x1401[193 + (4 * get_local_id(0))], x1401[194 + (4 * get_local_id(0))], x1401[195 + (4 * get_local_id(0))]), 0, &output[(((192 + ((4 * n205) * wg_id_1614)) + (4 * get_local_id(0))) + (256 * wg_id_1615)) + (n205 * get_local_id(1))]);"
"        }"
"         else {"
"          /* skip */"
"        }"
"        "
"      }"
"      "
"    }"
"    "
"  }"
"  "
"}"
"";
johanneslenfers commented 2 years ago

The mm is very heavy for non-gpu devices, so it's allowed to have an EXECUTION_ERROR. Btw. the assert to check that is missing here. If the codegen works it should be fine.

johanneslenfers commented 2 years ago

The mm is very heavy for non-gpu devices, so it's allowed to have an EXECUTION_ERROR. Btw. the assert to check that is missing here. If the codegen works it should be fine.

The execution succeeds on a RTX 2070.