clMathLibraries / clBLAS

a software library containing BLAS functions written in OpenCL
Apache License 2.0
843 stars 237 forks source link

Scasum shows undefined behavior after running Dzasum. #263

Closed mikhail-j closed 8 years ago

mikhail-j commented 8 years ago

I'm using clBLAS 2.10 with an AMD R9 390 GPU on Windows 7 x64. The issue occurs when I call the single precision complex absolute sum function (scasum) after double precision complex absolute sum function (dzasum). The following code is modified from example_sasum.c.

The following code gives me:
(123.00, 0.00) (123.00, 1.88).

Instead of the result I expected: (123.00, 0.00) (123.00, 0.00)

/* ************************************************************************
 * check complex
 * ************************************************************************/

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include "clBLAS.h"

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
/* Number of complex elements in each input vector below. */
static const size_t N = 7;
/* Double-precision input for Dzasum. Every imaginary part is 0 and the
 * absolute values of the real parts add up to 123.
 */
static cl_double2 X[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Increment passed to both asum calls; also used to size the X buffers. */
static const int incx = 1;
/* Host destination for the Dzasum result; two doubles are read back into it. */
static cl_double2 asum;
/* Single-precision input for Scasum, same values as X. */
static cl_float2 X2[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Host destination for the Scasum result; two floats are read back into it. */
static cl_float2 asum2;

int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_double2)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_double2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_double2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_double2)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasDzasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_double2),
                                    &asum, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
///////////////////////////////////////////////now do single precision complex numbers
    //cl_int err;
    platform = 0;
    device = 0;
    //props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    ctx = 0;
    queue = 0;
    cl_mem bufX2, bufAsum2, scratchBuff2;
    event = NULL;
    //int ret = 0;
    //int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float2)), NULL, &err);
    bufAsum2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX2, CL_TRUE, 0, (lenX*sizeof(cl_float2)), X2, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasScasum( N, bufAsum2, 0, bufX2, 0, incx, scratchBuff2,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_float2),
                                    &asum2, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX2);
    clReleaseMemObject(bufAsum2);
    clReleaseMemObject(scratchBuff2);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
mikhail-j commented 8 years ago

When I run dzasum after scasum, I get my expected result: (123.00, 0.00) (123.00, 0.00)

This is reflected in the following code.

/* ************************************************************************
 * check complex
 * ************************************************************************/

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include "clBLAS.h"

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
/* Number of complex elements in each input vector below. */
static const size_t N = 7;
/* Single-precision input for Scasum. Every imaginary part is 0 and the
 * absolute values of the real parts add up to 123.
 */
static cl_float2 X[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Increment passed to both asum calls; also used to size the X buffers. */
static const int incx = 1;
/* Host destination for the Scasum result; two floats are read back into it. */
static cl_float2 asum;
/* Double-precision input for Dzasum, same values as X. */
static cl_double2 X2[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Host destination for the Dzasum result; two doubles are read back into it. */
static cl_double2 asum2;

int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float2)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float2)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasScasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float2),
                                    &asum, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
///////////////////////////////////////////////now do single precision complex numbers
    //cl_int err;
    platform = 0;
    device = 0;
    //props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    ctx = 0;
    queue = 0;
    cl_mem bufX2, bufAsum2, scratchBuff2;
    event = NULL;
    //int ret = 0;
    //int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_double2)), NULL, &err);
    bufAsum2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_double2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_double2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX2, CL_TRUE, 0, (lenX*sizeof(cl_double2)), X2, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasDzasum( N, bufAsum2, 0, bufX2, 0, incx, scratchBuff2,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_double2),
                                    &asum2, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX2);
    clReleaseMemObject(bufAsum2);
    clReleaseMemObject(scratchBuff2);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
mikhail-j commented 8 years ago

I've also tested my code samples with clBLAS 2.3.0 (packaged with acml 6.1.0.33 on Windows), revealing the same results.

tingxingdong commented 8 years ago

Why do you call clblasSetup() and clblasTeardown() separately for the two routines?

You only need to set up and tear down once; in between, you can call as many clBLAS routines as you want.

On Mon, May 2, 2016 at 6:33 AM, Michael Jin notifications@github.com wrote:

I've also tested my code samples with clBLAS 2.3.0 (packaged with acml 6.1.0.33 on Windows), revealing the same results.

— You are receiving this because you are subscribed to this thread. Reply to this email directly or view it on GitHub https://github.com/clMathLibraries/clBLAS/issues/263#issuecomment-216211152

Tingxing dong

pavanky commented 8 years ago

@mikhail-j Scasum and Dzasum are supposed to return a float and a double respectively, not cl_float2 and cl_double2. The second component you are allocating is not touched by the function call. Print the values before and after the function call. You will see that the value of the second element is unchanged.

mikhail-j commented 8 years ago

@pavanky, setting cl_float2 asum2 = {{0,0}} before running clEnqueueReadBuffer doesn't change the result (123.00, 1.88).

I guess it's not a problem if we only want a float or double result. But I am curious: why does the second component of the float vector come out as 1.88?

pavanky commented 8 years ago

@mikhail-j That's not what I meant. Call clEnqueueReadBuffer before you call the clBLAS function to check what the second value is. The second value should stay the same before and after you call the clBLAS function.

pavanky commented 8 years ago

@mikhail-j I think you are assuming all newly allocated memory will be zeros. This is not true. They can have any value they want. In your case, for some reason, the second value is always getting 1.88 and is never touched by clBLAS.

I think the issue should be closed.

mikhail-j commented 8 years ago

@pavanky, I've modified my sample code to read the asum buffer before I call Scasum BLAS function. The first 2 float components of bufAsum2 are ( 0.00, 0.00 ). I have copied the following code I used below.

/* ************************************************************************
 * check complex
 * ************************************************************************/

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include "clBLAS.h"

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
/* Number of complex elements in each input vector below. */
static const size_t N = 7;
/* Double-precision input for Dzasum. Every imaginary part is 0 and the
 * absolute values of the real parts add up to 123.
 */
static cl_double2 X[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Increment passed to both asum calls; also used to size the X buffers. */
static const int incx = 1;
/* Host destination for the Dzasum result, explicitly zeroed up front. */
static cl_double2 asum = {{0,0}};
/* Single-precision input for Scasum, same values as X. */
static cl_float2 X2[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Host destination for the Scasum result buffer, explicitly zeroed up front;
 * it is overwritten by every read-back of that buffer.
 */
static cl_float2 asum2 = {{0,0}};

int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_double2)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_double2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_double2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_double2)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasDzasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_double2),
                                    &asum, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
///////////////////////////////////////////////now do single precision complex numbers
    //cl_int err;
    platform = 0;
    device = 0;
    //props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    ctx = 0;
    queue = 0;
    cl_mem bufX2, bufAsum2, scratchBuff2;
    event = NULL;
    //int ret = 0;
    //int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float2)), NULL, &err);
    bufAsum2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX2, CL_TRUE, 0, (lenX*sizeof(cl_float2)), X2, 0, NULL, NULL);

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_float2),
                                &asum2, 0, NULL, NULL);
    //printf("Result : %f\n", asum);
    printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));

    /* Call clblas function. */
    err = clblasScasum( N, bufAsum2, 0, bufX2, 0, incx, scratchBuff2,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_float2),
                                    &asum2, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX2);
    clReleaseMemObject(bufAsum2);
    clReleaseMemObject(scratchBuff2);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}

This program gives me these results: (123.00, 0.00) (0.00, 1.88) (123.00, 1.88)

I printed the contents of the variable asum2 after reading the buffer and before calling Scasum: 1.88 is the initial value of the second component. Why doesn't this issue occur when I call Scasum first, without Dzasum?

mikhail-j commented 8 years ago

When I run Scasum alone, reading the buffer before calling the clBLAS function gives me this result: (0.00, 0.00).

I've used this code below.

/* ************************************************************************
 * check complex
 * ************************************************************************/

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include "clBLAS.h"

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
/* Number of complex elements in each input vector below. */
static const size_t N = 7;
/* Single-precision input for Scasum. Every imaginary part is 0 and the
 * absolute values of the real parts add up to 123.
 */
static cl_float2 X[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Increment passed to the asum call; also used to size the X buffer. */
static const int incx = 1;
/* Host destination for the Scasum result buffer; two floats are read back. */
static cl_float2 asum;
/* Double-precision copy of the input; not referenced by main() below. */
static cl_double2 X2[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Not referenced by main() below. */
static cl_double2 asum2;

/*
 * Control experiment: runs clblasScasum alone on X and prints the contents
 * of the result buffer both before and after the call.
 *
 * The result buffer keeps the 2-component size of the reproduction and is
 * intentionally not cleared, so its second component shows whatever the
 * device memory already contained.
 *
 * Returns 0 on success and 1 when any OpenCL or clBLAS step fails.
 */
int
main(void)
{
    const size_t elemSize = sizeof(cl_float2);
    /* |incx| computed inline: the file does not include <stdlib.h> for abs(). */
    const size_t lenX = 1 + (N - 1) * (size_t)(incx < 0 ? -incx : incx);
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX = NULL, bufAsum = NULL, scratchBuff = NULL;
    cl_event event = NULL;
    cl_int err;
    int ret = 1;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects; stop at the first failed allocation. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, lenX * elemSize, NULL, &err);
    if (err == CL_SUCCESS) {
        bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, elemSize, NULL, &err);
    }
    if (err == CL_SUCCESS) {
        /* Allocate minimum of N elements */
        scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * elemSize, NULL, &err);
    }
    if (err != CL_SUCCESS) {
        printf("clCreateBuffer() failed with %d\n", err);
        goto cleanup;
    }

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, lenX * elemSize, X,
                               0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("clEnqueueWriteBuffer() failed with %d\n", err);
        goto cleanup;
    }

    /* Show what the result buffer holds before clBLAS touches it. */
    err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, elemSize,
                              &asum, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("clEnqueueReadBuffer() failed with %d\n", err);
        goto cleanup;
    }
    printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));

    /* Call clblas function. */
    err = clblasScasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasScasum() failed with %d\n", err);
        goto cleanup;
    }

    /* Wait for calculations to be finished. */
    err = clWaitForEvents(1, &event);
    if (err != CL_SUCCESS) {
        printf("clWaitForEvents() failed with %d\n", err);
        goto cleanup;
    }

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, elemSize,
                              &asum, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("clEnqueueReadBuffer() failed with %d\n", err);
        goto cleanup;
    }
    printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));
    ret = 0;

cleanup:
    /* Release only what was actually created; event stays NULL when the
     * BLAS call itself failed.
     */
    if (event != NULL) {
        clReleaseEvent(event);
    }
    if (bufX != NULL) {
        clReleaseMemObject(bufX);
    }
    if (bufAsum != NULL) {
        clReleaseMemObject(bufAsum);
    }
    if (scratchBuff != NULL) {
        clReleaseMemObject(scratchBuff);
    }

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
pavanky commented 8 years ago

There is no reason that the second value should be changing like that..

What version of clBLAS are you using ?

mikhail-j commented 8 years ago

I'm testing the Scasum and Dzasum clBLAS functions with both 2.3.0 and 2.10.0 (Hawaii).

I found that running Dzasum after Scasum on a double component vector gives me the expected result: (123.00, 0.00) (0.00,0.00) (123.00, 0.00)

The result (0.00, 0.00) was read using clEnqueueReadBuffer before calling Dzasum. I have copied the following code I used to generate these results.

/* ************************************************************************
 * check complex
 * ************************************************************************/

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include "clBLAS.h"

/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
/* Number of complex elements in each input vector below. */
static const size_t N = 7;
/* Single-precision input for Scasum. Every imaginary part is 0 and the
 * absolute values of the real parts add up to 123.
 */
static cl_float2 X[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Increment passed to the asum call; also used to size the X buffers. */
static const int incx = 1;
/* Host destination for the Scasum result; two floats are read back into it. */
static cl_float2 asum;
/* Double-precision input uploaded for the second phase, same values as X. */
static cl_double2 X2[] = {
    {{1,0}},
    {{2,0}},
    {{-11,0}},
    {{17,0}},
    {{5,0}},
    {{6,0}},
    {{81,0}}
};
/* Presumably the host destination for the second-phase (Dzasum) result;
 * TODO confirm against the rest of main().
 */
static cl_double2 asum2;

int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float2)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float2)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasScasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float2),
                                    &asum, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum), CIMAG(asum));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
///////////////////////////////////////////////now do single precision complex numbers
    //cl_int err;
    platform = 0;
    device = 0;
    //props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    ctx = 0;
    queue = 0;
    cl_mem bufX2, bufAsum2, scratchBuff2;
    event = NULL;
    //int ret = 0;
    //int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX2 = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_double2)), NULL, &err);
    bufAsum2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_double2)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_double2)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX2, CL_TRUE, 0, (lenX*sizeof(cl_double2)), X2, 0, NULL, NULL);

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_double2),
                                &asum2, 0, NULL, NULL);
    //printf("Result : %f\n", asum);
    printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));

    /* Call clblas function. */
    err = clblasDzasum( N, bufAsum2, 0, bufX2, 0, incx, scratchBuff2,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum2, CL_TRUE, 0, sizeof(cl_double2),
                                    &asum2, 0, NULL, NULL);
        //printf("Result : %f\n", asum);
        printf("(%9.2lf, %-9.2lf)\n", CREAL(asum2), CIMAG(asum2));
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX2);
    clReleaseMemObject(bufAsum2);
    clReleaseMemObject(scratchBuff2);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
kknox commented 8 years ago

Hi @mikhail-j, I have run the sample code you provided in this comment. I cannot find anything wrong with the behavior of clBLAS with the code you provided.

This program gives me these results: (123.00, 0.00)

This is the contents of the double precision bufAsum result buffer of your clblasDzasum call, after reading it to host. Looks good, but you are lucky the imaginary component is 0 because you never initialized bufAsum, and clblasDzasum never wrote to it.

(0.00, 1.88)

Before calling clblasScasum, you print the contents of your GPU allocated single precision result buffer bufAsum2. You don't initialize it to a known value; you are printing what is randomly contained in GPU memory, and it has junk in the imaginary component.

(123.00, 1.88)

This is the contents of the single precision bufAsum2 result buffer of your clblasScasum call, after reading it to host. The GPU kernel did not clobber the uninitialized imaginary value that was present.

I believe that this is an invalid issue, and will be closing this next week unless provided with more data.

@pavanky Thanks for your help

mikhail-j commented 8 years ago

Okay, I guess that is the expected behavior.

@pavanky Thanks for the timely feedback.

pavanky commented 8 years ago

@kknox Well, you put it better than I was trying to. I guess I misread some of the results because I could have sworn I saw the imaginary value change before and after the call. It could also have been me imagining things because I was tired :D