unisa-hpc / sycl-bench

SYCL Benchmark Suite
BSD 3-Clause "New" or "Revised" License

Runtime failure for the DGEMM application #80

Closed juhigupta0 closed 6 months ago

juhigupta0 commented 6 months ago

I am working on an example that I am trying to integrate and execute with the sycl-bench suite. I get a runtime error when testing it with AdaptiveCpp on both the CUDA and the HIP backend. When the same kernel is executed as a standalone ACpp application, it runs without any error. I see similar behavior in one of my other applications; one thing the two applications have in common is that they both use an nd_range parallel_for.

The matrices are currently initialized as identity matrices. Please let me know if you have any idea about the error.

Application code:

#include "common.h"

#include <iostream>
#include <sycl/sycl.hpp>

using namespace sycl;

class MatMulBlocked
{
protected:    

  BenchmarkArgs args;
  std::vector<double> initA;
  std::vector<double> initB;
  std::vector<double> initC;

  PrefetchedBuffer<double, 2> initA_buf;
  PrefetchedBuffer<double, 2> initB_buf;
  PrefetchedBuffer<double, 2> initC_buf;

  const size_t problem_size = 256;
  const size_t Ndim = 256;
  const size_t Mdim = 256;
  const size_t Pdim = 256;
  const size_t Bsize = 16;

public:
  MatMulBlocked(const BenchmarkArgs &_args) : args(_args) {}

  void setup() {
    // host memory initialization
    initA.resize(Ndim * Pdim);
    initB.resize(Pdim * Mdim);
    initC.resize(Ndim * Mdim);

    // Initialize matrix A to the identity
    for(size_t i = 0; i < Ndim; ++i) {
      for(size_t j = 0; j < Pdim; ++j) {
        initA[i * Pdim + j] = i == j;
      }
    }
    // Initialize matrix B to the identity
    for(size_t i = 0; i < Pdim; ++i) {
      for(size_t j = 0; j < Mdim; ++j) {
        initB[i * Mdim + j] = i == j;
      }
    }
    // Initialize matrix C to zero
    for(size_t i = 0; i < Ndim; ++i) {
      for(size_t j = 0; j < Mdim; ++j) {
        initC[i * Mdim + j] = 0;
      }
    }

    initA_buf.initialize(args.device_queue, initA.data(), range<2>(Ndim, Pdim));
    initB_buf.initialize(args.device_queue, initB.data(), range<2>(Pdim, Mdim));
    initC_buf.initialize(args.device_queue, initC.data(), range<2>(Ndim, Mdim));
  }

  void run(std::vector<event>& events) {
    events.push_back(args.device_queue.submit(
        [&](handler& cgh) {

      auto in1 = initA_buf.template get_access<access::mode::read>(cgh);
      auto in2 = initB_buf.template get_access<access::mode::read>(cgh);
      auto out = initC_buf.template get_access<access::mode::read_write>(cgh);

      // Use local memory address space for local memory
      accessor<double, 2, access_mode::read_write, access::target::local> Awrk({Bsize, Bsize}, cgh);
      accessor<double, 2, access_mode::read_write, access::target::local> Bwrk({Bsize, Bsize}, cgh);

      cgh.parallel_for<class SYCL_Matmul_blocked_kernel>(
        nd_range<2>{{Ndim, Mdim}, {Bsize, Bsize}}, 
          [=](nd_item<2> idx) {
            // This work-item will compute C(i,j)
            const size_t i = idx.get_global_id(0);
            const size_t j = idx.get_global_id(1);

            // Element C(i,j) is in block C(Iblk, Jblk)
            const size_t Iblk = idx.get_group(0);
            const size_t Jblk = idx.get_group(1);

            // C(i,j) is element C(iloc, jloc) of block C(Iblk, Jblk)
            const size_t iloc = idx.get_local_id(0);
            const size_t jloc = idx.get_local_id(1);

            // Number of blocks
            const size_t Nblk = Ndim / Bsize;
            const size_t Mblk = Mdim / Bsize;
            const size_t Pblk = Pdim / Bsize;

            for (size_t Kblk = 0; Kblk < Pblk; ++Kblk) {
              // Copy A and B into local memory
              Awrk[iloc][jloc] = in1[Iblk * Bsize + iloc][Kblk * Bsize + jloc];
              Bwrk[iloc][jloc] = in2[Kblk * Bsize + iloc][Jblk * Bsize + jloc];

              // Compute matmul for block
              for (size_t kloc = 0; kloc < Bsize; ++kloc) {
                out[i][j] += Awrk[iloc][kloc] * Bwrk[kloc][jloc];
              }
            }
        });
        args.device_queue.wait_and_throw();

    }));

  }

  bool verify(VerificationSetting &ver) {
    //Triggers writeback
    initC_buf.reset();
    bool pass = true;

    for(size_t i = 0; i < problem_size; ++i) {
      for(size_t j = 0; j < problem_size; ++j) {
        auto kernel_value = initC[i * Mdim + j];
        auto host_value = (i == j) ? 1.0 : 0.0;

        if(kernel_value != host_value) {
          pass = false;
          break;
        }
      }
    }
    return pass;
  }

  static std::string getBenchmarkName() {
    std::stringstream name;
    if(kernel_type_thorin)
      name << "Thorin_DGEMM_MatMulBlocked_";
    else
      name << "DGEMM_MatMulBlocked_";    
    return name.str();
  }
};

int main(int argc, char** argv)
{
  BenchmarkApp app(argc, argv);
  app.run<MatMulBlocked>();
  return 0;
}

Error log when offloading to the CUDA device:


********** Results for DGEMM_MatMulBlocked_**********
problem-size: 3072
local-size: 256
device-name: NVIDIA GeForce RTX 2080 SUPER
sycl-implementation: hipSYCL
============== hipSYCL error report ============== 
hipSYCL has caught the following undhandled asynchronous errors: 

   0. from /home/jgupta/development/opensycl/OpenSYCL/OpenSYCL_juhi/src/runtime/cuda/cuda_event.cpp:63 @ wait(): cuda_node_event: cudaEventSynchronize() failed (error code = CUDA:700)
The application will now be terminated.
terminate called without an active exception
zsh: IOT instruction (core dumped)

Error log when offloading to the HIP device:


********** Results for DGEMM_MatMulBlocked_**********
problem-size: 3072
local-size: 256
device-name: AMD Radeon Pro VII
sycl-implementation: hipSYCL
Memory access fault by GPU node-1 (Agent handle: 0x56096774a7e0) on address 0x7ffe8126f000. Reason: Page not present or supervisor privilege.
zsh: IOT instruction (core dumped)
Luigi-Crisci commented 6 months ago

You are accessing member variables in the kernel with a by-value [=] capture. C++ captures this and accesses the members through it, which results in an illegal memory access on the device. To fix this, you can either copy the members you need into local variables before submitting the kernel, or declare them static constexpr so they are no longer read through this.
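
Here is a minimal sketch of the first option (not from the thread, just for illustration): run() copies the sizes into local variables N, M, P and B (placeholder names) so the device lambda captures plain values instead of this. The sketch also adds the nd_item::barrier() calls that a blocked multiply needs around the local-memory tile, which the posted kernel omits.

  void run(std::vector<event>& events) {
    // Local copies of the member constants: the kernel lambda now captures
    // plain values rather than reaching through the host-side this pointer.
    const size_t N = Ndim;
    const size_t M = Mdim;
    const size_t P = Pdim;
    const size_t B = Bsize;

    events.push_back(args.device_queue.submit([&](handler& cgh) {
      auto in1 = initA_buf.template get_access<access::mode::read>(cgh);
      auto in2 = initB_buf.template get_access<access::mode::read>(cgh);
      auto out = initC_buf.template get_access<access::mode::read_write>(cgh);

      accessor<double, 2, access_mode::read_write, access::target::local> Awrk({B, B}, cgh);
      accessor<double, 2, access_mode::read_write, access::target::local> Bwrk({B, B}, cgh);

      cgh.parallel_for<class SYCL_Matmul_blocked_kernel>(
          nd_range<2>{{N, M}, {B, B}}, [=](nd_item<2> idx) {
            const size_t i = idx.get_global_id(0);
            const size_t j = idx.get_global_id(1);
            const size_t Iblk = idx.get_group(0);
            const size_t Jblk = idx.get_group(1);
            const size_t iloc = idx.get_local_id(0);
            const size_t jloc = idx.get_local_id(1);

            for(size_t Kblk = 0; Kblk < P / B; ++Kblk) {
              // Stage the current A and B tiles in local memory
              Awrk[iloc][jloc] = in1[Iblk * B + iloc][Kblk * B + jloc];
              Bwrk[iloc][jloc] = in2[Kblk * B + iloc][Jblk * B + jloc];
              idx.barrier(access::fence_space::local_space);

              // Accumulate the partial product for this tile
              for(size_t kloc = 0; kloc < B; ++kloc)
                out[i][j] += Awrk[iloc][kloc] * Bwrk[kloc][jloc];
              idx.barrier(access::fence_space::local_space);
            }
          });
    }));
  }

With the static constexpr alternative the kernel body can stay as it is, since constant expressions are not read through this either.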

juhigupta0 commented 6 months ago

Thank you for pointing it out. Much appreciated.