Potential Bugs in cuda code of Fortran array. The case where input Matrix is Fortran array, and axis == 0. The kernel code seems to use the wrong indexing. Here is our workable version, please confirm.
if axis == 0:
row_kernel(x_gpu, a_gpu, out, n, m,
block=block, grid=grid, stream=stream,
shared=24*x_gpu.dtype.itemsize)
__global__ void opRowVecToMat(const ${type}* mat, const ${type}* vec, ${type}* out,
const int n, const int m){
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int tidy = blockIdx.y * blockDim.y + threadIdx.y;
extern __shared__ ${type} shared_vec[];
if ((ty == 0) & (tidx < n))
shared_vec[tx] = vec[tidx];
__syncthreads();
if ((tidy < m) & (tidx < n)) {
out[tidy*n+tidx] = mat[tidy*n+tidx] ${binary_op} shared_vec[tx];
}
}
We haven't verify, but we think axis == 1 case for Fortran array is also buggy. The following are what we think how the code supposed to be like.
elif axis == 1:
col_kernel(x_gpu, a_gpu, out, n, m,
block=block, grid=grid, stream=stream,
shared=24*x_gpu.dtype.itemsize)
__global__ void opColVecToMat(const ${type} *mat, const ${type} *vec, ${type} *out,
const int n, const int m){
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int tidy = blockIdx.y * blockDim.y + threadIdx.y;
extern __shared__ ${type} shared_vec[];
if ((tx == 0) & (tidy < m))
shared_vec[ty] = vec[tidy];
__syncthreads();
if ((tidy < m) & (tidx < n)) {
out[tidy*n+tidx] = mat[tidy*n+tidx] ${binary_op} shared_vec[ty];
}
}
Environment
CUDA - 11.0.0
skcuda - 0.5.2
pycuda - 2020.1
python - 3. 7. 4
Linux - ubuntu- 18.04
Problem
Potential Bugs in cuda code of Fortran array. The case where input Matrix is Fortran array, and
axis == 0
. The kernel code seems to use the wrong indexing. Here is our workable version, please confirm.We haven't verify, but we think
axis == 1
case for Fortran array is also buggy. The following are what we think how the code supposed to be like.Environment
CUDA - 11.0.0 skcuda - 0.5.2 pycuda - 2020.1 python - 3. 7. 4 Linux - ubuntu- 18.04