alpaka-group / alpaka
Abstraction Library for Parallel Kernel Acceleration
https://alpaka.readthedocs.io
Mozilla Public License 2.0

cudaErrorNoKernelImageForDevice #859

Closed waredjeb closed 5 years ago

waredjeb commented 5 years ago

Hello, I'm trying to port a simple piece of code to the GPU CUDA runtime back-end: a simple matrix multiplication using the Eigen library. The code compiles, but I get the following error during execution:

terminate called after throwing an instance of 'std::runtime_error'
  what():  /data/user/wredjeb/cupla/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp(861) 'cudaSetDevice( iDstDev)' A previous CUDA call (not this one) set the error : 'cudaErrorNoKernelImageForDevice': 'no kernel image is available for execution on the device'!

The error is raised only when I enqueue the task (alpaka::queue::enqueue(queue, TaskKernelGpuCudaRt);). In both cases, however, the kernel does not seem to work, since it returns a final matrix containing only zeros. Below are the kernel and the part of the header that I use to fill and print the matrices.

HEADER

template <class C>
__host__ __device__ void printIt(C* m) {
  printf("\nMatrix %dx%d\n", (int)m->rows(), (int)m->cols());
  for (u_int r = 0; r < m->rows(); ++r) {
    for (u_int c = 0; c < m->cols(); ++c) {
      printf("Matrix(%d,%d) = %f\n", r, c, (*m)(r, c));
    }
  }
}
template <typename T>
void fillMatrix(T& t) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<> dis(0.0, 2.0);
  for (int row = 0; row < t.rows(); ++row) {
    for (int col = 0; col < t.cols(); ++col) {
      t(row, col) = dis(gen);
    }
  }
  return;
}

Multiplication KERNEL


template <typename M1, typename M2, typename M3>
struct kernelMultiply {
    template <typename T_Acc>
    ALPAKA_FN_ACC
    void operator()(T_Acc const& acc, M1* J, M2* C, M3* result) const {
#if TEST_DEBUG
        printf("*** GPU IN ***\n");
#endif
        // printIt(J);
        // printIt(C);
        // res.noalias() = (*J) * (*C);
        // printIt(&res);
        (*result) = (*J) * (*C);
#if TEST_DEBUG
        printf("*** GPU OUT ***\n");
#endif
    } // end operator()
}; // end kernelMultiply

Function that calls the kernel

template <int row1, int col1, int row2, int col2>
void testMultiply(){
    printf("TEST MULTIPLY");
    printf("Product of type %i x %i * %i x %i",row1,col1,row2,col2);

    using Dim = alpaka::dim::DimInt<1u>;
    using Idx = std::size_t;

    using Acc = alpaka::acc::AccGpuCudaRt<Dim,Idx>;
    //using ComputeStream = alpaka::stream::StreamCudaRtSync;
    using Queue = alpaka::queue::QueueCudaRtSync;

    using Dev = alpaka::dev::Dev<Acc>;
    using Pltf = alpaka::pltf::Pltf<Dev>; 

    Dev const devAcc(alpaka::pltf::getDevByIdx<Pltf>(0u));

    Queue queue(devAcc);

    using Vec = alpaka::vec::Vec<Dim, Idx>;

    Idx const numElements(1);
    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
    Vec const blocksPerGrid(static_cast<Idx>(1));

    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
    WorkDiv const workDiv(
        blocksPerGrid,
        threadsPerBlock,
        elementsPerThread);

    // Idx const gridThreadIdx(alpaka::idx::getIdx<workDiv, Idx>(devAcc)[0u]);

    using DataJ =  Matrix<double, row1, col1>;
    using DataC =  Matrix<double, row2, col2>;
    using DataR =  Matrix<double, row1, col2>;

    //Get the host device for allocating memory on the host.
    using DevHost = alpaka::dev::DevCpu;
    using PltfHost = alpaka::pltf::Pltf<DevHost>;
    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));

    //Allocate 3 host memory buffers
    using BufHostJ = alpaka::mem::buf::Buf<DevHost, DataJ, Dim, Idx>;
    using BufHostC = alpaka::mem::buf::Buf<DevHost, DataC, Dim, Idx>;
    using BufHostR = alpaka::mem::buf::Buf<DevHost, DataR, Dim, Idx>;

    BufHostJ bufHostJ(alpaka::mem::buf::alloc<DataJ,Idx>(devHost, numElements));
    BufHostC bufHostC(alpaka::mem::buf::alloc<DataC,Idx>(devHost, numElements));
    BufHostR bufHostResult(alpaka::mem::buf::alloc<DataR,Idx>(devHost, numElements));

    DataJ* const pBufHostJ(alpaka::mem::view::getPtrNative(bufHostJ));
    DataC* const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));
    DataR* const pBufHostResult(alpaka::mem::view::getPtrNative(bufHostResult));

    //Fill matrices
    fillMatrix(*pBufHostJ);
    fillMatrix(*pBufHostC);
    //fillMatrix(*pBufHostResult);

    printIt(pBufHostJ);
    printIt(pBufHostC);
    printIt(pBufHostResult);

    // Allocate 3 buffers on the accelerator
    using BufAccJ = alpaka::mem::buf::Buf<Dev, DataJ, Dim, Idx>;
    using BufAccC = alpaka::mem::buf::Buf<Dev, DataC, Dim, Idx>;
    using BufAccR = alpaka::mem::buf::Buf<Dev, DataR, Dim, Idx>;

    BufAccJ bufAccJ(alpaka::mem::buf::alloc<DataJ, Idx>(devAcc, elementsPerThread));
    BufAccC bufAccC(alpaka::mem::buf::alloc<DataC, Idx>(devAcc, elementsPerThread));
    BufAccR bufAccR(alpaka::mem::buf::alloc<DataR, Idx>(devAcc, elementsPerThread));

    // copy from Host to Acc
    alpaka::mem::view::copy(queue, bufAccJ, bufHostJ, elementsPerThread);
    alpaka::mem::view::copy(queue, bufAccC, bufHostC, elementsPerThread);
    alpaka::mem::view::copy(queue, bufAccR, bufHostResult, elementsPerThread);   

    //Instantiate the kernel function object
    kernelMultiply<DataJ, DataC, DataR> kernel;

    //alpaka::exec::ExecGpuCudaRt<Dim,Idx,kernel>;

    auto const TaskKernelGpuCudaRt(alpaka::kernel::createTaskKernel<Acc>(
        workDiv,
        kernel,
        alpaka::mem::view::getPtrNative(bufAccJ),
        alpaka::mem::view::getPtrNative(bufAccC),
        alpaka::mem::view::getPtrNative(bufAccR)));

    alpaka::queue::enqueue(queue, TaskKernelGpuCudaRt);  

    alpaka::mem::view::copy(queue,bufHostResult,bufAccR,elementsPerThread);

    printIt(pBufHostResult);
}

MAIN

auto main() -> int
{
    testMultiply<2, 2, 2, 2>();
    return EXIT_SUCCESS;
}

Error

./bin/GPUCudatestEigenAlpaka 
TEST MULTIPLY
Product of type 2 x 2 * 2 x 2
Matrix 2x2
Matrix(0,0) = 0.738374
Matrix(0,1) = 0.972253
Matrix(1,0) = 1.047253
Matrix(1,1) = 0.981868

Matrix 2x2
Matrix(0,0) = 0.636357
Matrix(0,1) = 1.613794
Matrix(1,0) = 1.880317
Matrix(1,1) = 0.490757

Matrix 2x2
Matrix(0,0) = 0.000000
Matrix(0,1) = 0.000000
Matrix(1,0) = 0.000000
Matrix(1,1) = 0.000000
terminate called after throwing an instance of 'std::runtime_error'
  what():  /data/user/wredjeb/cupla/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp(861) 'cudaSetDevice( iDstDev)' A previous CUDA call (not this one) set the error  : 'cudaErrorNoKernelImageForDevice': 'no kernel image is available for execution on the device'!

I'm using the develop branch of alpaka. This is my first attempt at using alpaka; what am I doing wrong?

sbastrakov commented 5 years ago

@waredjeb thanks for your report. It may not be relevant to the issue, but just in case, can you please provide the OS, compiler, CUDA version, and CMake options used for alpaka? Edit: using the develop branch is correct.

waredjeb commented 5 years ago

@sbastrakov thanks for the quick reply. I'm on CentOS 7.6 with CUDA 10.1. I'm compiling with nvcc and gcc (gcc version 8.3.1). Compilation flags:

CXXFLAGS="-m64 -std=c++14 -g -O2 -DALPAKA_DEBUG=0 -I$CUDA_ROOT/include -I$ALPAKA_ROOT/include"
HOST_FLAGS="-fopenmp -pthread -fPIC -ftemplate-depth-512 -Wall -Wextra -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-attributes -Wno-reorder -Wno-sign-compare"
NVCC_FLAGS="-ccbin $CXX -w -lineinfo --expt-extended-lambda --expt-relaxed-constexpr --generate-code arch=compute_50,code=sm_50 --use_fast_math --ftz=false --cudart shared"

sbastrakov commented 5 years ago

I am not sure this setup enables the CUDA back-end of alpaka; we would normally enable it via the CMake configuration. If this is indeed the issue (which I am not sure about), including <alpaka/standalone/GpuCudaRt.hpp> before the other alpaka includes might help.
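For illustration, the include order would look roughly like this (a minimal sketch; only the standalone header named above is taken from this thread, the rest is the usual alpaka entry point):

// Sketch: include the standalone CUDA back-end header first, so the
// CUDA back-end is enabled before any other alpaka header is parsed.
#include <alpaka/standalone/GpuCudaRt.hpp>
#include <alpaka/alpaka.hpp>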

cc @psychocoderHPC @BenjaminW3 you probably have a better idea than I do.

BenjaminW3 commented 5 years ago

The error description: "This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration." From this description my guess would be that the GPU that was selected via alpaka::pltf::getDevByIdx<Pltf>(0u) does not support the sm_50 given on the command line. @waredjeb Can you run CMake again with -DALPAKA_DEBUG=2? This should add extended traces to the output (and make everything slower) so that we can see which GPU was selected.
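As a quick cross-check outside of alpaka, the compute capability of the selected GPU can be queried with the plain CUDA runtime API (a minimal sketch; error handling omitted):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    // The binary must contain code for this compute capability,
    // e.g. sm_75 for major = 7, minor = 5.
    printf("Device 0: %s, compute capability %d.%d\n",
           prop.name, prop.major, prop.minor);
    return 0;
}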

waredjeb commented 5 years ago

@BenjaminW3 here is the output:

TEST MULTIPLY
Product of type 2 x 2 * 2 x 2
[+] getDevByIdx
[+] getDevCount
[-] getDevCount
[+] printDeviceProperties
name: Tesla T4
totalGlobalMem: 15079 MiB
sharedMemPerBlock: 48 KiB
regsPerBlock: 65536
warpSize: 32
memPitch: 2147483647 B
maxThreadsPerBlock: 1024
maxThreadsDim[3]: (1024, 1024, 64)
maxGridSize[3]: (2147483647, 65535, 65535)
clockRate: 1590000 kHz
totalConstMem: 64 KiB
major: 7
minor: 5
textureAlignment: 512
texturePitchAlignment: 32
multiProcessorCount: 40
kernelExecTimeoutEnabled: 0
integrated: 0
canMapHostMemory: 1
computeMode: 0
maxTexture1D: 131072
maxTexture1DLinear: 134217728
maxTexture2D[2]: 131072x65536
maxTexture2DLinear[3]: 131072x65000x2097120
maxTexture2DGather[2]: 32768x32768
maxTexture3D[3]: 16384x16384x16384
maxTextureCubemap: 32768
maxTexture1DLayered[2]: 32768x2048
maxTexture2DLayered[3]: 32768x32768x2048
maxTextureCubemapLayered[2]: 32768x2046
maxSurface1D: 32768
maxSurface2D[2]: 131072x65536
maxSurface3D[3]: 16384x16384x16384
maxSurface1DLayered[2]: 32768x2048
maxSurface2DLayered[3]: 32768x32768x2048
maxSurfaceCubemap: 32768
maxSurfaceCubemapLayered[2]: 32768x2046
surfaceAlignment: 512
concurrentKernels: 1
ECCEnabled: 1
pciBusID: 2
pciDeviceID: 0
pciDomainID: 0
tccDriver: 0
asyncEngineCount: 3
unifiedAddressing: 1
memoryClockRate: 5001000 kHz
memoryBusWidth: 256 b
l2CacheSize: 4194304 B
maxThreadsPerMultiProcessor: 1024
[-] printDeviceProperties
[-] getDevByIdx
[+] QueueCudaRtAsyncImpl
[-] QueueCudaRtAsyncImpl
[+] getDevByIdx
[+] getDevCount
[-] getDevCount
[-] getDevByIdx
[+] alloc
[+] BufCpuImpl
BufCpuImpl e: (1) ptr: 0x28e2780 pitch: 32
[-] BufCpuImpl
[-] alloc
[+] alloc
[+] BufCpuImpl
BufCpuImpl e: (1) ptr: 0x28e2800 pitch: 32
[-] BufCpuImpl
[-] alloc
[+] alloc
[+] BufCpuImpl
BufCpuImpl e: (1) ptr: 0x28e2880 pitch: 32
[-] BufCpuImpl
[-] alloc

Matrix 2x2
Matrix(0,0) = 0.814927
Matrix(0,1) = 1.884702
Matrix(1,0) = 1.981513
Matrix(1,1) = 1.664239

Matrix 2x2
Matrix(0,0) = 1.058731
Matrix(0,1) = 1.997586
Matrix(1,0) = 0.550919
Matrix(1,1) = 0.873779

Matrix 2x2
Matrix(0,0) = 0.000000
Matrix(0,1) = 0.000000
Matrix(1,0) = 0.000000
Matrix(1,1) = 0.000000
[+] alloc
alloc ew: 1 ewb: 32 ptr: 0x7fd5d7000000
[+] BufCudaRt
[-] BufCudaRt
[-] alloc
[+] alloc
alloc ew: 1 ewb: 32 ptr: 0x7fd5d7000200
[+] BufCudaRt
[-] BufCudaRt
[-] alloc
[+] alloc
alloc ew: 1 ewb: 32 ptr: 0x7fd5d7000400
[+] BufCudaRt
[-] BufCudaRt
[-] alloc
[+] createTaskCopy
[-] createTaskCopy
[+] enqueue
printDebug ddev: 0 ew: 1 ewb: 32 dw: 1 dptr: 0x7fd5d7000000 sdev: 0 sw: 1 sptr: 0x28e2780
[-] enqueue
[+] createTaskCopy
[-] createTaskCopy
[+] enqueue
printDebug ddev: 0 ew: 1 ewb: 32 dw: 1 dptr: 0x7fd5d7000200 sdev: 0 sw: 1 sptr: 0x28e2800
[-] enqueue
[+] createTaskCopy
[-] createTaskCopy
[+] enqueue
printDebug ddev: 0 ew: 1 ewb: 32 dw: 1 dptr: 0x7fd5d7000400 sdev: 0 sw: 1 sptr: 0x28e2880
[-] enqueue
createTaskKernel gridBlockExtent: (1), blockThreadExtent: (1)
[+] enqueue
enqueue gridDim: 1 1 1 blockDim: 1 1 1
enqueue BlockSharedMemDynSizeBytes: 0 B
enqueue binaryVersion: 0 constSizeBytes: 0 B localSizeBytes: 0 B maxThreadsPerBlock: 0 numRegs: 0 ptxVersion: 0 sharedSizeBytes: 0 B
/data/user/wredjeb/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp(375) 'cudaSetDevice( queue.m_spQueueImpl->m_dev.m_iDevice)' A previous CUDA call (not this one) set the error  : 'cudaErrorInvalidDeviceFunction': 'invalid device function'!
Illegal instruction

BenjaminW3 commented 5 years ago

Where does the sm_50 on the command line come from? Are you explicitly setting this when calling CMake? Could you try using -DALPAKA_CUDA_ARCH=75?
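If the flags are passed to nvcc directly rather than through CMake (as the NVCC_FLAGS above suggest), the equivalent change would be replacing the sm_50 code-generation option, roughly:

--generate-code arch=compute_75,code=sm_75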

waredjeb commented 5 years ago

-DALPAKA_CUDA_ARCH=75 worked! Thanks!

BenjaminW3 commented 5 years ago

Thanks, I will close this now. Keep in mind that each GPU requires a different architecture, so if you change the system this runs on, you have to change the CUDA device architecture accordingly. You can also compile for multiple architectures by using -DALPAKA_CUDA_ARCH="50;70;75". This allows the compiled code to run on all of those architectures, but it will also increase the compilation time.
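For reference, the same multi-architecture build expressed as plain nvcc options would look roughly like this (illustrative, using the --generate-code syntax from the NVCC_FLAGS above; each extra target increases compilation time and binary size):

--generate-code arch=compute_50,code=sm_50 \
--generate-code arch=compute_70,code=sm_70 \
--generate-code arch=compute_75,code=sm_75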