Closed hughperkins closed 8 years ago
Note that it is sufficient to run the original sger example, but just check clFinish afterwards. Both rowmajor and columnmajor cause out-of-bounds access, on these tiny 5x5 matrices :-(
/* ************************************************************************
* Copyright 2013 Advanced Micro Devices, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************/
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
/* Include CLBLAS header. It automatically includes needed OpenCL header,
* so we can drop out explicit inclusion of cl.h header.
*/
#include <clBLAS.h>
/* This example uses predefined matrices and their characteristics for
* simplicity purpose.
*/
static const clblasOrder order = clblasRowMajor;
static const size_t M = 5;
static const size_t N = 5;
static const cl_float alpha = 10;
static cl_float A[] = {
11, 12, 13, 14, 15,
21, 22, 23, 24, 25,
31, 32, 33, 34, 35,
41, 42, 43, 44, 45,
81, 22, 33, 14, 75
};
static const size_t lda = 5; /* i.e. lda = N */
static const cl_float X[] = {
11,
21,
31,
41,
51,
91,
};
static const int incx = 1;
static const cl_float Y[] = {
45,
23,
39,
45,
50,
10,
};
static const int incy = 1;
static void
printResult(void)
{
size_t i, j;
printf("\nResult:\n");
for (i = 0; i < M; i++) {
for(j = 0; j < N; j++)
printf("\t%f", A[ i*N + j ]);
printf("\n");
}
}
int
main(void)
{
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
cl_context ctx = 0;
cl_command_queue queue = 0;
cl_mem bufA, bufX, bufY;
cl_event event = NULL;
int ret = 0;
/* Setup OpenCL environment. */
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) {
printf( "clGetPlatformIDs() failed with %d\n", err );
return 1;
}
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) {
printf( "clGetDeviceIDs() failed with %d\n", err );
return 1;
}
props[1] = (cl_context_properties)platform;
ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS) {
printf( "clCreateContext() failed with %d\n", err );
return 1;
}
queue = clCreateCommandQueue(ctx, device, 0, &err);
if (err != CL_SUCCESS) {
printf( "clCreateCommandQueue() failed with %d\n", err );
clReleaseContext(ctx);
return 1;
}
/* Setup clblas. */
err = clblasSetup();
if (err != CL_SUCCESS) {
printf("clblasSetup() failed with %d\n", err);
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
return 1;
}
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * lda * sizeof(cl_float),
NULL, &err);
bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float),
NULL, &err);
bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float),
NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
M * lda * sizeof(cl_float), A, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), X, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), Y, 0, NULL, NULL);
/* Call clblas function. */
err = clFinish(queue);
printf("err %i\n", err);
err = clblasSger(order, M, N, alpha, bufX, 0, incx, bufY, 0, incy,
bufA, 0, lda, 1, &queue, 0, NULL, &event);
if (err != CL_SUCCESS) {
printf("clblasSger() failed with %d\n", err);
ret = 1;
}
else {
/* Wait for calculations to be finished. */
err = clWaitForEvents(1, &event);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (M * lda * sizeof(cl_float)),
A, 0, NULL, NULL);
/* At this point you will get the result of SGER placed in A array. */
printResult();
}
err = clFinish(queue);
printf("err %i\n", err);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufY);
clReleaseMemObject(bufX);
clReleaseMemObject(bufA);
/* Finalize work with clblas. */
clblasTeardown();
/* Release OpenCL working objects. */
clReleaseCommandQueue(queue);
clReleaseContext(ctx);
return ret;
}
(Basically, it looks like it works, but when you call clFinish, you can see that ... it crashed the commandqueue :-/ In 100 times out of 100, in my experience, this means that a read or a write went off the end of an array. Typically because this lacks a guard like if( get_global_id(0) < N ) { .... do stuff here... }
)
Note that it is sufficient for me to put return
at line 181 of src/library/blas/gens/clTemplates/ger.cl, to prevnet the commandqueue crash. Of course, this completely nullifies the effect of calling sger, but does demonstrate that the issue is probably inside this kernel.
changing to columnmajor (my own use case), and moving the return to line 73, also no error.
putting return
on line 75 => crashes
putting return
on line 102 => no crash
=> therefore the else
section of the if...else...
is being used.
return at line 140 => crash return at line 127 => no crash => therefore the first section of the line 125-150 if..else section is being used.
return just before line 135 => no crash return just after line 135 => crash => therefore the issue is at line 135:
%VSTORE( prevA, 0 , ( A + col*lda + row ) );
This is maybe substituted by a call to handleAlignedVSTORE in blas/gens/kprintf.cpp:
}else if (strcmp(f.key, "%VSTORE") == 0)
{
handleAlignedVSTORE(&src, &dst);
I guess that this causes the following opencl command:
A[col*lda+row] = prevA;
(just a guess though...) Therefore, if col*lda+row is larger than number of elements in A => out of bounds access
Ideally, one would think this would be guarded by the following if
s in the kernel:
if ( gTIDx < N) // if whithin last column
{
if( (row + %V - 1) < M ) // if the next V rows are still within M, then do vector math
{
so, why are they not? :-P
Anyway, A is M by N, therefore we can guard this by putting at line 121:
bool withinBounds = row < M && col < N; // prevent short-cutting causing extra execution paths
if( withinBounds ) {
... and then adding a }
at line 151 ish
(edited to fix typo in withinBounds
)
Hmmm, still get a -36 error :-/
Sooo... maybe its not the VSTORE, maybe the return just prevented a later loop iteration taht caused the -36? Will comment out instead
comment out entire 129-137block => no command queue crash
uncomment everything except VSTORE line => no command queue crash uncomment in addition the VSTORE line => command queue crash => seems it really is the vstore ... unless the lack of a vstore causes the compiler to 'optimize out' a ton of code :-(
After turning off compiler options (see comment below), commenting out VSTORE is NOT sufficient to prevent crash.
... but commenting out the whole 133-137 block is :-)
Its the %VMAD. Following code does not crash command qeuue:
prevA = %VLOAD( 0, ( A + col*lda + row ) );
%CONJUGATE(doConj, yReg);
%VMUL( temp, xReg, alpha );
//%VMAD( prevA, temp, yReg);
%VSTORE( prevA, 0 , ( A + col*lda + row ) );
(ie with VMAD commented out)
@hughperkins Would it be possible to update a single comment instead of creating new ones ? It makes it easier to track your progress and will also decrease the number of notifications we receive.
Hi guys, where can I pass a flag to kernel compiler to turn off kernel compiler optimization?
Ok, line 138ish of blas/gens/ger_lds.cpp perhasp?
Like this perhaps?
addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
Starting a new comment, since the last one is getting a bit long to keep scrolling up . sorry :-P Summary so far: commenting out the VMAD line, with compiler optimizations off, is sufficnet to prevent command queue crash.
0
at line 129, '%TYPE yReg = localY[ tIDx ];' does not prevent crash => might not be the localY accessprevA = temp;
=> crashprevA = yReg;
=> no crashtemp
?prevA = temp;
, comment out %VMUL
line => no crash :-P0
at line 130 => still crashes :-(%TYPE%V xReg = 0;
=> no crash :-) maybe getting close :-)%TYPE%V xReg = localX[0];
=> no crash :-P%TYPE
to %TYPE%V
?__local %TYPE%V*localXV = (__local %TYPE%V*)localX;
:
%TYPE%V xReg = localX[0];
does not crash%TYPE%V xReg = localXV[0];
crashesif
staement on line 126 to if( false && (row + %V - 1) < M )
=> no crash :-)Guys, I'm fairly sure that on nVidia, there is no speed advantage to doing vector maths in gpu kernels. I can find evidence for this if you need? Is there some way we can either:
Ah-hah! I can also fix it by changing line 56 :-) to:
__local %TYPE%V localXV[ BH ];
__local %TYPE *localX = (__local %TYPE *)localXV;
It's an alignment issue I guess :-)
(By the way, changing VEC_LEN to 1 in ger_lds.cpp also fixes the issue; and also gives correct results)
Is this fixed by merging pr #115? If so can we close it?
Great! Thanks! :-)
following code falis, with error -36 'invalid queue'. This usually menas an out of bounds array access occurred.