sger fails on nvidia - Githubissues

hughperkins commented 9 years ago

following code falis, with error -36 'invalid queue'. This usually menas an out of bounds array access occurred.

/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include <clBLAS.h>
/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
static const clblasOrder order = clblasColumnMajor;
static const size_t M = 5;
static const size_t N = 5;
static const cl_float alpha = 1;
static const size_t lda = 5;
static const int incx = 1;
static const int incy = 1;
/*static void*/
/*printResult(void)*/
/*{*/
/*    size_t i, j;*/
/*    printf("\nResult:\n");*/
/*    for (i = 0; i < M; i++) {*/
/*        for(j = 0; j < N; j++)*/
/*            printf("\t%f", A[ i*N + j ]);*/
/*        printf("\n");*/
/*    }*/
/*}*/
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }
    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }
    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }
    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }
    cl_float *pX = (cl_float *)malloc(M * sizeof(cl_float));
    cl_float *pY = (cl_float *)malloc(N * sizeof(cl_float));
    cl_float *pA = (cl_float *)malloc(M*N* sizeof(cl_float));
    memset(pX, 0, M*sizeof(cl_float));
    memset(pY, 0, N*sizeof(cl_float));
    memset(pA, 0, M*N*sizeof(cl_float));
    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * lda * sizeof(cl_float),
                          NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float),
                          NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float),
                          NULL, &err);
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        M * lda * sizeof(cl_float), pA, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
        ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), pX, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
        ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), pY, 0, NULL, NULL);
    /* Call clblas function. */
    err = clFinish(queue);
    printf("err %i\n", err);
    err = clblasSger(order, M, N, alpha, bufX, 0, incx, bufY, 0, incy,
        bufA, 0, lda, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSger() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);
        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (M * lda * sizeof(cl_float)),
                                  pA, 0, NULL, NULL);
        /* At this point you will get the result of SGER placed in A array. */
/*        printResult();*/
    }
    err = clFinish(queue);
    printf("err %i\n", err);
    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufA);
    free(pX);
    free(pY);
    free(pA);
    /* Finalize work with clblas. */
    clblasTeardown();
    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return ret;
}

hughperkins commented 9 years ago

Note that it is sufficient to run the original sger example, but just check clFinish afterwards. Both rowmajor and columnmajor cause out-of-bounds access, on these tiny 5x5 matrices :-(

/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
/* Include CLBLAS header. It automatically includes needed OpenCL header,
 * so we can drop out explicit inclusion of cl.h header.
 */
#include <clBLAS.h>
/* This example uses predefined matrices and their characteristics for
 * simplicity purpose.
 */
static const clblasOrder order = clblasRowMajor;
static const size_t M = 5;
static const size_t N = 5;
static const cl_float alpha = 10;
static cl_float A[] = {
    11, 12, 13, 14, 15,
    21, 22, 23, 24, 25,
    31, 32, 33, 34, 35,
    41, 42, 43, 44, 45,
    81, 22, 33, 14, 75
};
static const size_t lda = 5;    /* i.e. lda = N */
static const cl_float X[] = {
    11,
    21,
    31,
    41,
    51,
    91,
};
static const int incx = 1;
static const cl_float Y[] = {
    45,
    23,
    39,
    45,
    50,
    10,
};
static const int incy = 1;
static void
printResult(void)
{
    size_t i, j;
    printf("\nResult:\n");
    for (i = 0; i < M; i++) {
        for(j = 0; j < N; j++)
            printf("\t%f", A[ i*N + j ]);
        printf("\n");
    }
}
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }
    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }
    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }
    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }
    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * lda * sizeof(cl_float),
                          NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float),
                          NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float),
                          NULL, &err);
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        M * lda * sizeof(cl_float), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
        ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
        ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), Y, 0, NULL, NULL);
    /* Call clblas function. */
    err = clFinish(queue);
    printf("err %i\n", err);
    err = clblasSger(order, M, N, alpha, bufX, 0, incx, bufY, 0, incy,
        bufA, 0, lda, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSger() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);
        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (M * lda * sizeof(cl_float)),
                                  A, 0, NULL, NULL);
        /* At this point you will get the result of SGER placed in A array. */
        printResult();
    }
    err = clFinish(queue);
    printf("err %i\n", err);
    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufA);
    /* Finalize work with clblas. */
    clblasTeardown();
    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return ret;
}

hughperkins commented 9 years ago

(Basically, it looks like it works, but when you call clFinish, you can see that ... it crashed the commandqueue :-/ In 100 times out of 100, in my experience, this means that a read or a write went off the end of an array. Typically because this lacks a guard like if( get_global_id(0) < N ) { .... do stuff here... } )

hughperkins commented 9 years ago

Note that it is sufficient for me to put return at line 181 of src/library/blas/gens/clTemplates/ger.cl, to prevnet the commandqueue crash. Of course, this completely nullifies the effect of calling sger, but does demonstrate that the issue is probably inside this kernel.

hughperkins commented 9 years ago

changing to columnmajor (my own use case), and moving the return to line 73, also no error.

hughperkins commented 9 years ago

putting return on line 75 => crashes putting return on line 102 => no crash => therefore the else section of the if...else... is being used.

hughperkins commented 9 years ago

return at line 140 => crash return at line 127 => no crash => therefore the first section of the line 125-150 if..else section is being used.

hughperkins commented 9 years ago

return just before line 135 => no crash return just after line 135 => crash => therefore the issue is at line 135:

%VSTORE( prevA, 0 , ( A + col*lda + row ) );

hughperkins commented 9 years ago

This is maybe substituted by a call to handleAlignedVSTORE in blas/gens/kprintf.cpp:

                }else if (strcmp(f.key, "%VSTORE") == 0)
                {
                    handleAlignedVSTORE(&src, &dst);

I guess that this causes the following opencl command:

A[col*lda+row] = prevA;

(just a guess though...) Therefore, if col*lda+row is larger than number of elements in A => out of bounds access

Ideally, one would think this would be guarded by the following ifs in the kernel:

        if ( gTIDx < N)  // if whithin last column
        {
            if( (row + %V - 1) < M )  // if the next V rows are still within M, then do vector math
            {

so, why are they not? :-P

Anyway, A is M by N, therefore we can guard this by putting at line 121:

bool withinBounds = row < M && col < N; // prevent short-cutting causing extra execution paths
if( withinBounds ) {

... and then adding a } at line 151 ish

(edited to fix typo in withinBounds)

Hmmm, still get a -36 error :-/

Sooo... maybe its not the VSTORE, maybe the return just prevented a later loop iteration taht caused the -36? Will comment out instead

comment out entire 129-137block => no command queue crash

uncomment everything except VSTORE line => no command queue crash uncomment in addition the VSTORE line => command queue crash => seems it really is the vstore ... unless the lack of a vstore causes the compiler to 'optimize out' a ton of code :-(

After turning off compiler options (see comment below), commenting out VSTORE is NOT sufficient to prevent crash.

... but commenting out the whole 133-137 block is :-)

Its the %VMAD. Following code does not crash command qeuue:

                prevA = %VLOAD( 0, ( A + col*lda + row ) );
                %CONJUGATE(doConj, yReg);
                %VMUL( temp, xReg, alpha );
                //%VMAD( prevA, temp, yReg);
                %VSTORE( prevA, 0 , ( A + col*lda + row ) );

(ie with VMAD commented out)

pavanky commented 9 years ago

@hughperkins Would it be possible to update a single comment instead of creating new ones ? It makes it easier to track your progress and will also decrease the number of notifications we receive.

hughperkins commented 9 years ago

Hi guys, where can I pass a flag to kernel compiler to turn off kernel compiler optimization?

Ok, line 138ish of blas/gens/ger_lds.cpp perhasp?

Like this perhaps?

addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-cl-opt-disable");

hughperkins commented 9 years ago

Starting a new comment, since the last one is getting a bit long to keep scrolling up . sorry :-P Summary so far: commenting out the VMAD line, with compiler optimizations off, is sufficnet to prevent command queue crash.

uncommenting VMAD, and changing tIDx to 0 at line 129, '%TYPE yReg = localY[ tIDx ];' does not prevent crash => might not be the localY access
... so thats bizarre since the other parameters to VMAD are scalars?
recheck: commenting out VMAD really does prevent command queue crash. Odd :-P
replace vmad line with prevA = temp; => crash
replace vmad line with prevA = yReg; => no crash
plausibly something to do with temp?
replace vmad line with prevA = temp;, comment out %VMUL line => no crash :-P
oh, commenting out just the %VMUL line, and leaving vmad line, also no crash :-) => perhaps is the VMUL line in fact?
uncommenting %VMUL, and changing the localX index to 0 at line 130 => still crashes :-(
but changing this entire line to %TYPE%V xReg = 0; => no crash :-) maybe getting close :-)
putting the TYPE%V xReg line back to as before, and commenting out lines 79 and 108 (which set values in the local array) => still crashes :-/
oh thats interesting, %TYPE%V is a float4, a vector type right?
oh... is it possible taht the float4 goes off the edge of the array?
hmmm, but using a 4x4 matrix, in sger.c, instead of 5x5, still crashes the command queue
- 8x8 crashes it too
- 1x1 no crash though :-P
- 2x2 no crash
- 3x3 no crash
- (but 4x4: crash)
but is this because for 3x3 and below, we dont use float4?
- no: 3x3 still uses float4
- ( can check this by the way very simply: simply duplicate line 130, this will then cause a runtime error complaining about xReg already being defined, and telling us the type of xReg)
replacing line 130 with %TYPE%V xReg = localX[0]; => no crash :-P
is it bcause we are casting localX from %TYPE to %TYPE%V?
with the following line added before line 130 __local %TYPE%V*localXV = (__local %TYPE%V*)localX;:
- %TYPE%V xReg = localX[0]; does not crash
- %TYPE%V xReg = localXV[0]; crashes
changing the if staement on line 126 to if( false && (row + %V - 1) < M ) => no crash :-)

hughperkins commented 9 years ago

Guys, I'm fairly sure that on nVidia, there is no speed advantage to doing vector maths in gpu kernels. I can find evidence for this if you need? Is there some way we can either:

provide an option to clBLAS users to turn off vector maths? and / or
detect nvidia, and turn off vector maths automatically on nvidia?

hughperkins commented 9 years ago

Ah-hah! I can also fix it by changing line 56 :-) to:

    __local %TYPE%V localXV[ BH ];
    __local %TYPE *localX = (__local %TYPE *)localXV;

It's an alignment issue I guess :-)

(By the way, changing VEC_LEN to 1 in ger_lds.cpp also fixes the issue; and also gives correct results)

TimmyLiu commented 8 years ago

Is this fixed by merging pr #115? If so can we close it?

hughperkins commented 8 years ago

Great! Thanks! :-)

clMathLibraries / clBLAS

sger fails on nvidia #108