Closed jinz2014 closed 4 years ago
I have a question about the device sum in the basic cuda directory. Thanks
Are the following reductions (host and device) the same?
signed char* lattice_b_h = (signed char*) malloc(nx * ny/2 * sizeof(*lattice_b_h)); signed char* lattice_w_h = (signed char*) malloc(nx * ny/2 * sizeof(*lattice_w_h)); CHECK_CUDA(cudaMemcpy(lattice_b_h, lattice_b, nx * ny/2 * sizeof(*lattice_b), cudaMemcpyDeviceToHost)); CHECK_CUDA(cudaMemcpy(lattice_w_h, lattice_b, nx * ny/2 * sizeof(*lattice_w), cudaMemcpyDeviceToHost)); double naivesum = 0.0; for (int i = 0; i < nx*ny/2; i++) { naivesum += lattice_b_h[i]; naivesum += lattice_w_h[i]; }
// Reduce double* devsum; int nchunks = (nx * ny/2 + CUB_CHUNK_SIZE - 1)/ CUB_CHUNK_SIZE; CHECK_CUDA(cudaMalloc(&devsum, 2 * nchunks * sizeof(*devsum))); size_t cub_workspace_bytes = 0; void* workspace = NULL; CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, lattice_b, devsum, CUB_CHUNK_SIZE)); CHECK_CUDA(cudaMalloc(&workspace, cub_workspace_bytes)); for (int i = 0; i < nchunks; i++) { CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_b[i*CUB_CHUNK_SIZE], devsum + 2*i, std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE))); CHECK_CUDA(cub::DeviceReduce::Sum(workspace, cub_workspace_bytes, &lattice_w[i*CUB_CHUNK_SIZE], devsum + 2*i + 1, std::min((long long) CUB_CHUNK_SIZE, nx * ny/2 - i * CUB_CHUNK_SIZE))); } double* hostsum; hostsum = (double*)malloc(2 * nchunks * sizeof(*hostsum)); CHECK_CUDA(cudaMemcpy(hostsum, devsum, 2 * nchunks * sizeof(*devsum), cudaMemcpyDeviceToHost)); double fullsum = 0.0; for (int i = 0; i < 2 * nchunks; i++) { fullsum += hostsum[i]; }
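The naive sum code snippet you posted has an error: the second cudaMemcpy call copies from lattice_b (instead of lattice_w) into lattice_w_h, so the host loop adds lattice_b twice and never includes lattice_w. Once the source of that copy is changed to lattice_w, the two sums should be the same.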
cudaMemcpy
lattice_b
lattice_w_h
Thank you!
I have a question about the device sum in the basic cuda directory. Thanks
Are the following reductions (host and device) the same?