clMathLibraries / clBLAS

a software library containing BLAS functions written in OpenCL
Apache License 2.0
843 stars 237 forks source link

clBLAS fails when using multiple contexts #197

Closed pavanky closed 8 years ago

pavanky commented 8 years ago

The code to reproduce the problem is here:

#include <stdio.h>
#include <iostream>
#include <vector>

/* Include the clBLAS header. It includes the appropriate OpenCL headers */
#include <clBLAS.h>

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */

/* Matrix dimensions for the row-major GEMM C = alpha*A*B + beta*C:
 * A is M x K, B is K x N, C is M x N. */
#define M  4
#define N  3
#define K  5

/* Scale factor applied to the A*B product. */
static const cl_float alpha = 1;

/* A: M x K, row-major. Values encode their own position (row*10 + col,
 * 1-based) so a wrong result is easy to spot by eye. */
static const cl_float A[M*K] = {
    11, 12, 13, 14, 15,
    21, 22, 23, 24, 25,
    31, 32, 33, 34, 35,
    41, 42, 43, 44, 45,
};
static const size_t lda = K;        /* row stride of A (row-major) */

/* B: K x N, row-major, same positional encoding as A. */
static const cl_float B[K*N] = {
    11, 12, 13,
    21, 22, 23,
    31, 32, 33,
    41, 42, 43,
    51, 52, 53,
};
static const size_t ldb = N;        /* row stride of B (row-major) */

/* beta = 0, so the initial contents of C are overwritten, not accumulated. */
static const cl_float beta = 0;

/* C: M x N input/output matrix uploaded before the GEMM call. */
static cl_float C[M*N] = {
    11, 12, 13,
    21, 22, 23,
    31, 32, 33,
    41, 42, 43,
};
static const size_t ldc = N;        /* row stride of C (row-major) */

/* Host-side destination for reading the GEMM result back from the device. */
static cl_float result[M*N];

void print(const char *msg, const cl_float *ptr, int m, int n)
{
    std::cout << msg << std::endl;
    for(int i = 0; i < n; i++) {
        for(int j = 0; j < m; j++) {
            std::cout << ptr[i * m + j] << "\t";
        }
        std::cout << std::endl;
    }
}

/* Abort when the most recent OpenCL/clBLAS call failed.
 * Reads the enclosing scope's `err`; on a nonzero status, reports the
 * source line and exits with that status.
 * Fix: no semicolon after `while(0)` -- the original ended the macro with
 * `while(0);`, which defeats the do/while(0) idiom: `if (x) ERR(); else ...`
 * would not compile, and every `ERR();` expanded to an extra empty
 * statement. The caller supplies the semicolon. */
#define ERR() do {      \
    if(err != 0) {      \
        printf("%d Error = %d\n", __LINE__, err);   \
        exit(err);      \
    }                   \
} while(0)

/* Create a fresh context and queue on `device`, upload A and B, run
 * clblasSgemm (row-major, no transpose, full matrices -- all offsets 0)
 * into C's buffer, read the product back into `result`, and release
 * everything again.
 * Returns 0 on success; exits via ERR() on any OpenCL/clBLAS failure.
 * Each call builds an independent context on purpose: that is what
 * exposes the multi-context kernel-cache bug this issue reports.
 * Fixes: the creation status of the context and queue is now checked
 * (it was silently ignored), and the clblasSgemm completion event is
 * now released (it leaked once per call). */
int func(cl_device_id &device, cl_context_properties *props)
{
    cl_int err;
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufB, bufC;
    cl_event event = NULL;
    int ret = 0;

    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    ERR();
    queue = clCreateCommandQueue( ctx, device, 0, &err );
    ERR();

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
            NULL, &err );
    ERR();
    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
            NULL, &err );
    ERR();
    bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
            NULL, &err );
    ERR();

    /* Blocking writes (CL_TRUE): host arrays may be reused immediately. */
    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
            M * K * sizeof( *A ), A, 0, NULL, NULL );
    ERR();
    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
            K * N * sizeof( *B ), B, 0, NULL, NULL );
    ERR();
    err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
            M * N * sizeof( *C ), C, 0, NULL, NULL );
    ERR();

    print("A", A, K, M);
    print("B", B, N, K);

    /* C = alpha*A*B + beta*C over the full matrices (all offsets are 0). */
    err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
            M, N, K,
            alpha, bufA, 0, lda,
            bufB, 0, ldb, beta,
            bufC, 0, ldc,
            1, &queue, 0, NULL, &event );
    ERR();

    /* Wait for calculations to be finished. */
    err = clWaitForEvents( 1, &event );
    ERR();

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
            M * N * sizeof(*result),
            result, 0, NULL, NULL );
    ERR();

    print("R", result, N, M);

    /* Release the GEMM completion event (previously leaked). */
    clReleaseEvent( event );

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufC );
    clReleaseMemObject( bufB );
    clReleaseMemObject( bufA );

    /* Release OpenCL working objects. */
    clReleaseCommandQueue( queue );
    clReleaseContext( ctx );

    return ret;
}

/* Reproduction driver: enumerate GPU devices on the first platform,
 * initialize clBLAS once, then call func() at least twice so that at
 * least two distinct contexts are created (either one per GPU, or two
 * on the same GPU) -- the condition that triggers the bug.
 * Fixes: bail out when no GPU is found (the original computed
 * `idx % ndevices` with ndevices == 0, i.e. division by zero), and
 * check the clGetDeviceInfo status instead of ignoring it. */
int main()
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_uint ndevices = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    ERR();
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 0, NULL, &ndevices );
    ERR();

    /* Guard the `idx % ndevices` below against ndevices == 0. */
    if (ndevices == 0) {
        printf("No GPU devices found\n");
        return 1;
    }

    std::vector<cl_device_id> devices(ndevices);

    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, ndevices, &devices.front(), NULL );
    ERR();

    props[1] = (cl_context_properties)platform;

    err = clblasSetup( );
    ERR();

    // The bug happens when at least 2 contexts are created.
    // We can either use multiple GPUs if available,
    // or create 2 contexts for the same device.
    int runs = ndevices > 1 ? (int)ndevices : 2;
    for(int idx = 0; idx < runs; idx++) {
        int id = idx % (int)ndevices;
        cl_device_id device = devices[id];
        char name[256];
        err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
        ERR();
        printf("Device %d = %s\n", id, name);

        func(device, props);
    }

    /* Finalize work with clBLAS */
    clblasTeardown( );

    return 0;
}
TimmyLiu commented 8 years ago

The problem is probably in the kernel-caching code, which assumes a single context/device scenario.

makeGemmKernel()

only checks if clKernel is built. But the kernel can be built for another context or device.

shehzan10 commented 8 years ago

We tested with gemv and that is working fine. Is there anything special in gemm and are there any other functions that use the same mechanism as gemm?

TimmyLiu commented 8 years ago

gemm and trsm since trsm calls gemm.

pavanky commented 8 years ago

@TimmyLiu If it is just those two files (xgemm.cc and xtrsm.cc), I think I can send in a patch that solves the issue in a couple of days. I am assuming there is no restriction against c++11 in clBLAS ?

TimmyLiu commented 8 years ago

@pavanky I think all the major compilers we use support C++11. xgemm.cc, xtrsm.cc, and GemmSpecialCases.cpp call makeGemmKernel(). They all use some static kernels, either generated by python or checked into the repo.

hughperkins commented 8 years ago

On Windows, python 2.7 needs Visual Studio 2008. python 3.4 Visual Studio 2010. Both of these support c++0x, but not c++11. strong preference to use c++0x as the standard.

TimmyLiu commented 8 years ago

I was able to use python 2.7 with VS 2015. @hughperkins are you saying vs 2010 would not work with python 2.7?

hughperkins commented 8 years ago

Ok, so, if all you want to do is use python as your generator, in clBLAS itself, then compiler options are irrelevant.

However, my DeepCL project:

Python 2.7 was built with Visual Studio 2008. Python 3.4 was built with Visual Studio 2010. Any project that wishes to be imported into python, and used from python, must be buildable using the exact same compiler.

Edit: example of a python script that loads DeepCL, using import: https://github.com/hughperkins/DeepCL/blob/master/python/test_deepcl.py#L5

kknox commented 8 years ago

@hughperkins clBLAS only uses python at build time, not at runtime. It's a tool which generates cl kernels and c++ host code for us, which are built into the clblas library (excluding the python).

On a separate topic (which I don't think you asked about, but I make note of just in case), python wrappers can be built for clBLAS as an interface to python. A proof of concept for gemm was created using cython. I don't remember anymore what requirements cython imposes for client python scripts, but I'm sure we would have the same minimum requirements as pyopencl.

pavanky commented 8 years ago

@kknox I do not want to derail the topic too much, but I've found ctypes to be fairly straight forward when wrapping a C interface. You just load the clBLAS library and call the appropriate symbol / function. There is also cffi module which might provide a bit more safety than ctypes

Going this way reduces the required dependencies (i.e. no reason to use cython).

hughperkins commented 8 years ago

DeepCL can be used from Python, and is imported from python, therefore needs to be built using the exact same compiler as python, on Windows.

Since DeepCL links with clBLAS, this means that deepcl needs to build clblas with the exact same compiler as python, on Windows.

On Windows, python 2.7 is built with visual studio 2008, and python 3.4 is built with visual studio 2010. Therefore, deepcl, and therefore by extension clblas, should be buildable on windows using both visual studio 2008 and visual studio 2010.

Visual studio 2010 supports C++0x, but does NOT support c++11.

Strong preference that clblas can continue to be compiled using c++0x.

kknox commented 8 years ago

@pavanky Do you know how well cffi could integrate with pyopencl? I experimented with cython because I had seen other projects use cython and pyopencl together as a proof-point. Any kind of python wrapper for clBLAS needs to be able to accept pyopencl command queues and contexts, and return pyopencl events.

Since DeepCL links with clBLAS, this means that deepcl needs to build clblas with the exact same compiler as python, on Windows.

@hughperkins I assume that the reason you care which compiler clBLAS is compiled with is because you care about the msvc runtime dependency and you link clblas statically. Are you restricted from linking clblas dynamically? You should be able to mix msvc runtimes then.

hughperkins commented 8 years ago

I'm linking dynamically. Mixing msvc libraries between dlls is a dangerous practice, which will lead to weird and unexplained behavior, when memory allocated on the heap is shared between the caller and the callee, and when structs created in caller or callee are passed to the other. You can see the following articles for more details:

If the library is very simple, not needing to pass arrays or structs between the caller <-> callee, then you can sometimes get away with mixing runtimes. Such very simple usage is not the case with clBLAS, which involves passing many structs, and involves a significant amount of allocating and freeing.

pavanky commented 8 years ago

I have fixed the issue in our fork without using C++11. I think we can stop discussing the topic now.

hughperkins commented 8 years ago

I have fixed the issue in our fork without using C++11.

Thank you Pavan

pavanky commented 8 years ago

I am closing this issue because it is resolved in develop.

hughperkins commented 8 years ago

This is a very important issue to have fixed :-)

hughperkins commented 8 years ago

Just to confirm, this is fixed in autogemm right? Not just for some of the non-generated kernels? (edited to say 'non-generated' rather than 'statically defined')

pavanky commented 8 years ago

yes, this is for the autogemm kernels.

hughperkins commented 8 years ago

Cool :-) Nice! :-)