mark-poscablo / gpu-sum-reduction

CUDA implementation of the fundamental sum reduce operation. Aims to be as optimized as reasonable.

Wrong result when replacing warpReduce with the real code block #1

Open XiaotaoChen opened 4 years ago

XiaotaoChen commented 4 years ago

Hi, I'm learning CUDA reduction from the NVIDIA doc. When I unroll the last warp with a device function (warpFunc below, the warpReduce from the doc), the result is correct; however, the result is wrong when I replace the function call with the equivalent inline code block. In your reduce.cu you use the inline block instead of a warpReduce function, and when I test your code the result is correct, so I don't know what's wrong with mine. Could you review my code and point out the problem? Thanks. The test code and its output are below.

CUDA code:

#include <algorithm>   // std::fill_n
#include <cstdio>
#include <cuda_runtime.h>

// unroll the last warp
__device__ void warpFunc(volatile int* sA, int tid) {
    sA[tid] += sA[tid+32];
    sA[tid] += sA[tid+16];
    sA[tid] += sA[tid+8];
    sA[tid] += sA[tid+4];
    sA[tid] += sA[tid+2];
    sA[tid] += sA[tid+1];
}

// Version 1: last warp unrolled via the device function warpFunc (volatile pointer)
__global__ void reduce_kernel3(int*A, int*out, int len_a) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    extern __shared__ int sA[];
    sA[tid] = 0;
    if (idx < len_a) sA[tid] = A[idx];
    __syncthreads();
    for (int s=blockDim.x/2; s>32; s>>=1) {
        if(tid <s) {
          sA[tid] += sA[tid+s];
        }
        __syncthreads();
    }
    if (tid<32) warpFunc(sA, tid);
    if (tid==0) out[bid] = sA[0];
}

// Version 2: last warp unrolled inline, without volatile (gives the wrong result here)
__global__ void reduce_kernel3_2(int*A, int*out, int len_a) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    extern __shared__ int sA[];
    sA[tid] = 0;
    if (idx < len_a) sA[tid] = A[idx];
    __syncthreads();
    for (int s=blockDim.x/2; s>32; s>>=1) {
        if(tid <s) {
          sA[tid] += sA[tid+s];
        }
        __syncthreads();
    }
    if (tid<32) {
        sA[tid] += sA[tid+32];
        sA[tid] += sA[tid+16];
        sA[tid] += sA[tid+8];
        sA[tid] += sA[tid+4];
        sA[tid] += sA[tid+2];
        sA[tid] += sA[tid+1];
    }
    if (tid==0) out[bid] = sA[0];
}

// Version 3: last warp unrolled inline, with __syncthreads() after each step
__global__ void reduce_kernel3_3(int*A, int*out, int len_a) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    extern __shared__ int sA[];
    sA[tid] = 0;
    if (idx < len_a) sA[tid] = A[idx];
    __syncthreads();
    for (int s=blockDim.x/2; s>32; s>>=1) {
        if(tid <s) {
          sA[tid] += sA[tid+s];
        }
        __syncthreads();
    }
    if (tid<32) {
        sA[tid] += sA[tid+32]; __syncthreads();
        sA[tid] += sA[tid+16]; __syncthreads();
        sA[tid] += sA[tid+8]; __syncthreads();
        sA[tid] += sA[tid+4]; __syncthreads();
        sA[tid] += sA[tid+2]; __syncthreads();
        sA[tid] += sA[tid+1]; __syncthreads();
    }
    if (tid==0) out[bid] = sA[0];
}

int sum_on_cpu(int *data, int len) {
    int result = 0;
    for (int i=0; i<len; i++) result += data[i];
    return result;
}

int sum_on_gpu(int *data, int len, int numThreads, int flag) {
    int numBlocks = (len + numThreads -1) / numThreads;
    int*tmp;
    cudaMalloc((void**)&tmp, numBlocks*sizeof(int));
    while(len>1) {
        if (flag==0) {
            reduce_kernel3<<<numBlocks, numThreads, numThreads*sizeof(int)>>>(data, tmp, len);
        }
        else if (flag==1){
            reduce_kernel3_2<<<numBlocks, numThreads, numThreads*sizeof(int)>>>(data, tmp, len);
        }
        else{
            reduce_kernel3_3<<<numBlocks, numThreads, numThreads*sizeof(int)>>>(data, tmp, len);
        }

        len = numBlocks;
        numBlocks = (numBlocks+numThreads-1)/numThreads;
        cudaMemcpy(data, tmp, sizeof(int) * len, cudaMemcpyDeviceToDevice);
    }
    int result;
    cudaMemcpy(&result, tmp, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(tmp);
    return result;
}

int main() {
    int len = 1024 * 1024;
    int numThreads = 256;
    int *h_data = (int*) malloc(len * sizeof(int));
    std::fill_n(h_data, len, 1);

    int* d_data1, *d_data2, *d_data3;
    cudaMalloc((void**)&d_data1, len * sizeof(int));
    cudaMemcpy(d_data1, h_data, len*sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_data2, len * sizeof(int));
    cudaMemcpy(d_data2, h_data, len*sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_data3, len * sizeof(int));
    cudaMemcpy(d_data3, h_data, len*sizeof(int), cudaMemcpyHostToDevice);

    int cpu_result = sum_on_cpu(h_data, len);
    int gpu_result1 = sum_on_gpu(d_data1, len, numThreads, 0);
    int gpu_result2 = sum_on_gpu(d_data2, len, numThreads, 1);
    int gpu_result3 = sum_on_gpu(d_data3, len, numThreads, 2);
    printf("result: cpu:%d, gpu k1:%d, k2:%d, k3:%d\n", cpu_result, gpu_result1, gpu_result2, gpu_result3);

    cudaFree(d_data1);
    cudaFree(d_data2);
    cudaFree(d_data3);
    free(h_data);
}

Its output:

result: cpu:1048576, gpu k1:1048576, k2:3920, k3:1048576
gau-nernst commented 7 months ago

This is a very old issue, so I hope you figured it out, but I recently ran into the same problem. The author also notes this issue in the README:

For some unknown reason (for now), incorporating loop unrolling in the implementation makes the Debug compiled code output the correct values, while the Release compiled code outputs incorrect values.

What I think is happening is that, without marking the shared memory as volatile, the compiler optimizes away the repeated memory reads. In other words, it assumes that sA[tid + 16], sA[tid + 8], ... have not changed, so it loads all of them at the start and then adds them to the result. This is why the problem only shows up in Release mode.
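
Roughly, this is the transformation the compiler is allowed to make. The block below is only an illustration of that reasoning, not actual generated code:

    // Illustration only: with sA not volatile, the compiler may hoist all the
    // loads ahead of the additions, so each thread adds stale neighbour values
    // instead of the partial sums its neighbours just produced.
    int a32 = sA[tid+32], a16 = sA[tid+16], a8 = sA[tid+8];
    int a4  = sA[tid+4],  a2  = sA[tid+2],  a1 = sA[tid+1];
    sA[tid] += a32 + a16 + a8 + a4 + a2 + a1;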

Apart from calling warpFunc(), which marks the pointer as volatile, I found that simply re-casting the pointer to volatile also works:

    volatile float *_shmem = shmem;
    for (int stride = warpSize; stride > 0; stride /= 2)
      _shmem[tid] += _shmem[tid + stride];
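
Applied to reduce_kernel3_2 above, a minimal fix along those lines would look like this (a sketch of the same idea, not the author's code):

    if (tid < 32) {
        volatile int* vsA = sA;  // every access now goes back to shared memory
        vsA[tid] += vsA[tid+32];
        vsA[tid] += vsA[tid+16];
        vsA[tid] += vsA[tid+8];
        vsA[tid] += vsA[tid+4];
        vsA[tid] += vsA[tid+2];
        vsA[tid] += vsA[tid+1];
    }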

Calling __syncthreads() after each step, as you did in your version 3 (reduce_kernel3_3), also works, since it forces the shared-memory values to be re-read. In my experiments __syncwarp() works as well.
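
For completeness, on newer toolkits the last warp can also be reduced with warp shuffles, which avoids shared-memory visibility in the final steps altogether. A sketch using __shfl_down_sync (CUDA 9+; this is an alternative technique, not something from the code above):

    if (tid < 32) {
        int val = sA[tid] + sA[tid + 32];   // fold 64 partial sums into 32 registers
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);  // reduce within the warp
        if (tid == 0) out[bid] = val;
    }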

Hope this also helps other people!