microsoft / nnfusion

A flexible and efficient deep neural network (DNN) compiler that generates high-performance executable from a DNN model description.
MIT License
959 stars 163 forks source link

[BUG] GENERIC_CPU backend can not generate model code with Antares kernels #350

Closed xysmlx closed 2 years ago

xysmlx commented 2 years ago

🐛 Bug

Running with -fdefault_device=GENERIC_CPU -fantares_mode=true cannot generate model code with Antares kernels. The reason is that AntaresCpuKernelEmitter::get_or_emit_source() returns nullptr due to a code format mismatch.

    // Extract the kernel code from the Antares-generated source text.
    // The s_*/e_* literals are start/end search markers (presumably for the
    // function body and for the rank value — the rank markers are not used in
    // the visible part of this function).
    // NOTE(review): the sample kernel in this report carries a
    // "// [thread_extent]" comment, not "// [thread_compute]\n", so the start
    // marker below would not match that output — confirm the expected format.
    const char* s_func_pattern = "// [thread_compute]\n";
    const char* e_func_pattern = "\n}\n";
    const char* s_rank_pattern = "__rank__ = ";
    const char* e_rank_pattern = "\n";
    std::string::size_type s_func_pos = antares_code.find(s_func_pattern);
    std::string::size_type e_func_pos = antares_code.rfind(e_func_pattern);

    // find()/rfind() yield npos when the marker is absent. Give up only when a
    // marker is missing; the previous `!=` test bailed out when a marker WAS
    // found and let npos positions through otherwise.
    if (s_func_pos == std::string::npos || e_func_pos == std::string::npos)
        return nullptr;
/// Element-wise float addition over one rank's share of a flat buffer.
///
/// `__args[0]` and `__args[1]` are read as the two float inputs and
/// `__args[2]` is written as the float output; every visited index `i` gets
/// `output[i] = input0[i] + input1[i]`.
///
/// The three low bits of `__rank__` each select one half of a dimension, so
/// ranks 0..7 together touch every index in [0, 2 * 802816) exactly once.
extern "C" void template_op_kernel0(const int __rank__, void** __args) {
  const float* __restrict lhs = static_cast<const float*>(__args[0]);
  const float* __restrict rhs = static_cast<const float*>(__args[1]);
  float* dst = static_cast<float*>(__args[2]);

  // Marker comment kept verbatim from the generated code; tooling may parse it.
  // [thread_extent] __rank__ = 8

  // Offset contributed by the rank alone (bit 2, bit 1 and bit 0 respectively).
  const int rank_base = ((__rank__ >> 2) * 802816)
                      + (((__rank__ & 3) >> 1) * 25088)
                      + ((__rank__ & 1) * 7);

  for (int n0_blk = 0; n0_blk < 8; ++n0_blk) {
    for (int n1_blk = 0; n1_blk < 32; ++n1_blk) {
      for (int n2_blk = 0; n2_blk < 2; ++n2_blk) {
        // Start of the tile addressed by the three outer loop counters.
        const int tile_base = rank_base + (n0_blk * 100352) + (n1_blk * 784) + (n2_blk * 98);
        for (int n0 = 0; n0 < 2; ++n0) {
          for (int n1 = 0; n1 < 4; ++n1) {
            for (int n2 = 0; n2 < 7; ++n2) {
              // Seven consecutive elements are processed from this position.
              const int row_base = tile_base + (n0 * 50176) + (n1 * 196) + (n2 * 14);
              for (int n3 = 0; n3 < 7; ++n3) {
                const int idx = row_base + n3;
                dst[idx] = lhs[idx] + rhs[idx];
              }
            }
          }
        }
      }
    }
  }
}