inducer / loopy

A code generator for array-based code on CPUs and GPUs
http://mathema.tician.de/software/loopy
MIT License
565 stars 71 forks source link

matmul.floopy doesn't seem to work #805

Closed certik closed 10 months ago

certik commented 10 months ago

I am using the latest loopy, which I installed into a Conda environment using "pip install ." (I also had to install "fparser"):

$ loopy matmul.floopy 
Traceback (most recent call last):
  File "/Users/ondrej/mambaforge/envs/loopy/bin/loopy", line 5, in <module>
    loopy.cli.main()
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/cli.py", line 196, in main
    cgr = lp.generate_code_v2(t_unit)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/__init__.py", line 629, in generate_code_v2
    cgr = generate_code_for_a_single_kernel(program[func_id],
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/__init__.py", line 416, in generate_code_for_a_single_kernel
    codegen_result = generate_host_or_device_program(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/result.py", line 339, in generate_host_or_device_program
    codegen_result = build_loop_nest(codegen_state, schedule_index)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/control.py", line 203, in build_loop_nest
    inner = generate_code_for_sched_index(codegen_state, schedule_index)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/control.py", line 54, in generate_code_for_sched_index
    codegen_result = generate_host_or_device_program(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/result.py", line 323, in generate_host_or_device_program
    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 902, in get_temporary_decls
    decl = self.get_temporary_var_declarator(codegen_state, tv)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 1084, in get_temporary_var_declarator
    temp_var_decl = self.get_array_base_declarator(temp_var)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/opencl.py", line 753, in get_array_base_declarator
    return POD(self, dtype, ary.name)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 277, in __init__
    self.ctype = ast_builder.target.dtype_to_typename(dtype)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 442, in dtype_to_typename
    return self.get_dtype_registry().dtype_to_ctype(dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/pytools/__init__.py", line 777, in wrapper
    result = function(obj, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/opencl.py", line 576, in get_dtype_registry
    from loopy.target.c.compyte.dtypes import (DTypeRegistry,
ModuleNotFoundError: No module named 'loopy.target.c.compyte'

It seems to happen with the C target as well:

$ loopy --target=c matmul.floopy   
Traceback (most recent call last):
  File "/Users/ondrej/mambaforge/envs/loopy/bin/loopy", line 5, in <module>
    loopy.cli.main()
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/cli.py", line 196, in main
    cgr = lp.generate_code_v2(t_unit)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/__init__.py", line 629, in generate_code_v2
    cgr = generate_code_for_a_single_kernel(program[func_id],
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/__init__.py", line 416, in generate_code_for_a_single_kernel
    codegen_result = generate_host_or_device_program(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/result.py", line 339, in generate_host_or_device_program
    codegen_result = build_loop_nest(codegen_state, schedule_index)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/control.py", line 203, in build_loop_nest
    inner = generate_code_for_sched_index(codegen_state, schedule_index)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/control.py", line 54, in generate_code_for_sched_index
    codegen_result = generate_host_or_device_program(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/codegen/result.py", line 323, in generate_host_or_device_program
    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 902, in get_temporary_decls
    decl = self.get_temporary_var_declarator(codegen_state, tv)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 1084, in get_temporary_var_declarator
    temp_var_decl = self.get_array_base_declarator(temp_var)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 1004, in get_array_base_declarator
    arg_decl = POD(self, ary.dtype, ary.name)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 277, in __init__
    self.ctype = ast_builder.target.dtype_to_typename(dtype)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 442, in dtype_to_typename
    return self.get_dtype_registry().dtype_to_ctype(dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/pytools/__init__.py", line 777, in wrapper
    result = function(obj, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ondrej/mambaforge/envs/loopy/lib/python3.11/site-packages/loopy/target/c/__init__.py", line 1345, in get_dtype_registry
    from loopy.target.c.compyte.dtypes import (
ModuleNotFoundError: No module named 'loopy.target.c.compyte'

matthiasdiener commented 10 months ago

Did you check out the submodule when cloning the loopy repo? (e.g., git submodule update --init).

certik commented 10 months ago

Ah, I didn't! After checking out the submodule, everything works now:

$ loopy matmul.floopy 
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#define LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \
    MACRO_NAME(int8, char) \
    MACRO_NAME(int16, short) \
    MACRO_NAME(int32, int) \
    MACRO_NAME(int64, long)
#define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \
    inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \
    { \
        if (a<0) \
            a = a - (b-1); \
        return a/b; \
    }
LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)
#undef LOOPY_DEFINE_FLOOR_DIV_POS_B
#undef LOOPY_CALL_WITH_INTEGER_TYPES

__kernel void __attribute__ ((reqd_work_group_size(8, 16, 1))) dgemm(int const m, int const n, int const l, double const alpha, __global double const *__restrict__ a, __global double const *__restrict__ b, __global double *__restrict__ c)
{
  __local double a_acc_0[16 * 32];
  __local double b_acc_0[32 * 8];

  if (-1 + l >= 0)
    for (int k_outer = 0; k_outer <= -1 + loopy_floor_div_pos_b_int32(31 + l, 32); ++k_outer)
    {
      barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (a_acc rev-depends on insn0) */;
      if (-1 + -1 * lid(0) + -16 * gid(0) + m >= 0 && -1 + -32 * k_outer + -1 * lid(1) + l >= 0)
        for (int i2_outer = 0; i2_outer <= ((-32 + l + -32 * k_outer >= 0) ? 1 : -1 + -1 * lid(1) + -2 * k_outer + (15 + l + 15 * lid(1)) / 16); ++i2_outer)
          for (int i1_outer = 0; i1_outer <= ((-16 + m + -16 * gid(0) >= 0) ? 1 : -1 + -1 * lid(0) + -2 * gid(0) + (7 + m + 7 * lid(0)) / 8); ++i1_outer)
            a_acc_0[32 * (8 * i1_outer + lid(0)) + 16 * i2_outer + lid(1)] = a[16 * gid(0) + 8 * i1_outer + lid(0) + m * (32 * k_outer + 16 * i2_outer + lid(1))];
      if (-1 + -32 * k_outer + -1 * lid(0) + l >= 0 && -1 + -1 * lid(1) + -8 * gid(1) + n >= 0 && 7 + -1 * lid(1) >= 0)
        for (int i1_outer_0 = 0; i1_outer_0 <= ((-33 + l + -32 * k_outer >= 0) ? 3 : -1 + -1 * lid(0) + -4 * k_outer + (7 + l + 7 * lid(0)) / 8); ++i1_outer_0)
          b_acc_0[8 * (8 * i1_outer_0 + lid(0)) + lid(1)] = b[32 * k_outer + 8 * i1_outer_0 + lid(0) + l * (8 * gid(1) + lid(1))];
      barrier(CLK_LOCAL_MEM_FENCE) /* for a_acc_0 (insn0 depends on a_acc) */;
      if (-1 + -1 * lid(0) + -8 * gid(1) + n >= 0 && -1 + -1 * lid(1) + -16 * gid(0) + m >= 0)
        for (int k_inner = 0; k_inner <= ((-32 + l + -32 * k_outer >= 0) ? 31 : -1 + l + -32 * k_outer); ++k_inner)
          c[16 * gid(0) + lid(1) + m * (8 * gid(1) + lid(0))] = c[16 * gid(0) + lid(1) + m * (8 * gid(1) + lid(0))] + alpha * b_acc_0[8 * k_inner + lid(0)] * a_acc_0[32 * lid(1) + k_inner];
    }
}

Thanks @matthiasdiener !