Closed certik closed 10 months ago
Did you check out the submodule when cloning the loopy repo (e.g., `git submodule update --init`)?
Ah, I didn't! After checking out the submodule, everything works now:
$ loopy matmul.floopy
/* Shorthands used throughout the kernel below: the OpenCL work-item
 * queries get_local_id / get_group_id for dimension N, cast to int so the
 * index arithmetic is done on signed ints. */
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
/* The kernel below uses `double`; when the OpenCL C version is below 1.2
 * the cl_khr_fp64 extension is enabled explicitly. */
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
/* X-macro: invokes MACRO_NAME(SUFFIX, TYPE) once for each of the four
 * (suffix, type) pairs int8/char, int16/short, int32/int, int64/long. */
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
MACRO_NAME(int8, char) \
MACRO_NAME(int16, short) \
MACRO_NAME(int32, int) \
MACRO_NAME(int64, long)
/* Defines `inline TYPE loopy_floor_div_pos_b_<SUFFIX>(TYPE a, TYPE b)`.
 * For negative a, (b-1) is subtracted from a before dividing; since integer
 * division truncates toward zero, this makes the result round toward
 * negative infinity (floor(a/b)) when b > 0.
 * NOTE(review): the "_pos_b" suffix suggests callers guarantee b > 0; the
 * function itself does not check this -- confirm against the generator. */
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
{ \
if (a<0) \
a = a - (b-1); \
return a/b; \
}
/* Instantiate the four functions (_int8, _int16, _int32, _int64). Only the
 * _int32 variant is called in the kernel shown below. */
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
/* The helper macros are no longer needed once the functions are defined. */
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#undef LOOPY_CALL_WITH_INTEGER_TYPES
/* Generated kernel `dgemm`, required work-group size 8 x 16 x 1.
 *
 * Reading the index expressions below, with i = 16*gid(0) + lid(1) and
 * j = 8*gid(1) + lid(0), each work-item performs
 *     c[i + m*j] += alpha * a[i + m*k] * b[k + l*j]   for k = 0 .. l-1,
 * processing k in chunks of 32 that are first staged in __local arrays.
 *
 * Parameters:
 *   m, n, l : extents used in the bounds guards (i < m, j < n, k < l) and as
 *             strides (m for a and c, l for b).
 *   alpha   : scalar multiplied into every accumulated product.
 *   a, b    : read-only __global inputs.
 *   c       : __global array that is read and updated in place.
 */
__kernel void __attribute__ ((reqd_work_group_size(8, 16, 1))) dgemm(int const m, int const n, int const l, double const alpha, __global double const *__restrict__ a, __global double const *__restrict__ b, __global double *__restrict__ c)
{
/* Per-work-group staging buffers: a_acc_0 is filled from a, b_acc_0 from b. */
__local double a_acc_0[16 * 32];
__local double b_acc_0[32 * 8];
/* Nothing to do when l < 1. */
if (-1 + l >= 0)
/* k_outer runs over floor((l + 31) / 32) chunks, each covering up to 32 values of k. */
for (int k_outer = 0; k_outer <= -1 + loopy_floor_div_pos_b_int32(31 + l, 32); ++k_outer)
{
barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (a_acc rev-depends on insn0) */;
/* Stage the chunk of a. Guard: 16*gid(0) + lid(0) < m and 32*k_outer + lid(1) < l. */
if (-1 + -1 * lid(0) + -16 * gid(0) + m >= 0 && -1 + -32 * k_outer + -1 * lid(1) + l >= 0)
/* Each work-item copies up to 2 x 2 elements (i2_outer steps by 16, i1_outer by 8 */
/* in the indices below); the ternaries shrink the trip count on partial chunks. */
for (int i2_outer = 0; i2_outer <= ((-32 + l + -32 * k_outer >= 0) ? 1 : -1 + -1 * lid(1) + -2 * k_outer + (15 + l + 15 * lid(1)) / 16); ++i2_outer)
for (int i1_outer = 0; i1_outer <= ((-16 + m + -16 * gid(0) >= 0) ? 1 : -1 + -1 * lid(0) + -2 * gid(0) + (7 + m + 7 * lid(0)) / 8); ++i1_outer)
/* a_acc_0[32*r + kk] = a[(16*gid(0) + r) + m*(32*k_outer + kk)], r = 8*i1_outer + lid(0), kk = 16*i2_outer + lid(1). */
a_acc_0[32 * (8 * i1_outer + lid(0)) + 16 * i2_outer + lid(1)] = a[16 * gid(0) + 8 * i1_outer + lid(0) + m * (32 * k_outer + 16 * i2_outer + lid(1))];
/* Stage the chunk of b. Guard: 32*k_outer + lid(0) < l, 8*gid(1) + lid(1) < n, and lid(1) <= 7. */
if (-1 + -32 * k_outer + -1 * lid(0) + l >= 0 && -1 + -1 * lid(1) + -8 * gid(1) + n >= 0 && 7 + -1 * lid(1) >= 0)
/* Up to 4 copies per participating work-item (i1_outer_0 steps by 8 in the index below). */
for (int i1_outer_0 = 0; i1_outer_0 <= ((-33 + l + -32 * k_outer >= 0) ? 3 : -1 + -1 * lid(0) + -4 * k_outer + (7 + l + 7 * lid(0)) / 8); ++i1_outer_0)
/* b_acc_0[8*kk + lid(1)] = b[(32*k_outer + kk) + l*(8*gid(1) + lid(1))], kk = 8*i1_outer_0 + lid(0). */
b_acc_0[8 * (8 * i1_outer_0 + lid(0)) + lid(1)] = b[32 * k_outer + 8 * i1_outer_0 + lid(0) + l * (8 * gid(1) + lid(1))];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (insn0 depends on a_acc) */;
/* Accumulate into c. Guard: 8*gid(1) + lid(0) < n and 16*gid(0) + lid(1) < m. */
if (-1 + -1 * lid(0) + -8 * gid(1) + n >= 0 && -1 + -1 * lid(1) + -16 * gid(0) + m >= 0)
/* k_inner covers 32 values on a full chunk, otherwise the remaining l - 32*k_outer values. */
for (int k_inner = 0; k_inner <= ((-32 + l + -32 * k_outer >= 0) ? 31 : -1 + l + -32 * k_outer); ++k_inner)
c[16 * gid(0) + lid(1) + m * (8 * gid(1) + lid(0))] = c[16 * gid(0) + lid(1) + m * (8 * gid(1) + lid(0))] + alpha * b_acc_0[8 * k_inner + lid(0)] * a_acc_0[32 * lid(1) + k_inner];
}
/* NOTE(review): the '%' after the closing brace below looks like a shell */
/* end-of-output marker captured by the paste, not kernel source -- confirm. */
}%
Thanks @matthiasdiener !
I am using the latest loopy, which I installed into a Conda environment using "pip install ."; I also had to install "fparser":
It seems to happen with the C target as well: