In the following code from miniFE, the comments claim that __syncthreads() is not needed once the reduction is down to a single warp. However, I think those __syncthreads() calls are actually required to produce a correct sum: I got incorrect results when I omitted them. Could you reproduce the issue? A standalone reproducer follows the miniFE kernels below. Thank you for your comments.
template<typename Vector>
__global__ void dot_kernel(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {
  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;
  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  //__syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)  {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)  {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8)  {sum+=red[threadIdx.x+8];  red[threadIdx.x]=sum;}
  if(threadIdx.x<4)  {sum+=red[threadIdx.x+4];  red[threadIdx.x]=sum;}
  if(threadIdx.x<2)  {sum+=red[threadIdx.x+2];  red[threadIdx.x]=sum;}
  if(threadIdx.x<1)  {sum+=red[threadIdx.x+1];}

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}
template<typename Scalar>
__global__ void dot_final_reduce_kernel(Scalar *d) {
  const int BLOCK_SIZE=1024;
  Scalar sum=d[threadIdx.x];
  __shared__ volatile Scalar red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)  {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)  {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8)  {sum+=red[threadIdx.x+8];  red[threadIdx.x]=sum;}
  if(threadIdx.x<4)  {sum+=red[threadIdx.x+4];  red[threadIdx.x]=sum;}
  if(threadIdx.x<2)  {sum+=red[threadIdx.x+2];  red[threadIdx.x]=sum;}
  if(threadIdx.x<1)  {sum+=red[threadIdx.x+1];}

  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}
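For reference, here is how I imagine the warp-level tail of these kernels could be made explicitly synchronized. This is my own sketch for illustration, not the miniFE code, and I am assuming __syncwarp() (available since CUDA 9) is the appropriate primitive here. It would replace the lines from the "__syncthreads(); if(threadIdx.x<32)" step through the "if(threadIdx.x<1)" step:

  // My own sketch, not the miniFE code: keep the __syncthreads() steps above unchanged,
  // but make the intra-warp steps explicit. The __syncwarp() between each shared-memory
  // read and the following write orders the accesses within the warp.
  __syncthreads();
  if(threadIdx.x<32) {
    sum+=red[threadIdx.x+32]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+16]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+8];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+4];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+2];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+1];
  }

Is this the kind of change that is needed here, or is relying on volatile plus implicit warp-synchronous execution still considered safe?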
Here is the standalone reproducer I mentioned above; it uses a plain int dot product but the same reduction structure:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define BLOCK_SIZE 256

__global__ void dot_kernel(const int n, const int* x, const int* y, int *d) {
  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) { // incorrect results when __syncthreads() is omitted for the warp-sized steps (n < 32)
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

__global__ void final(int *d) {
  int sum=d[threadIdx.x];
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }

  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}

#define LEN 1025

int main() {
  int a[LEN];
  int b[LEN];
  srand(2);
  int sum = 0;
  int d_sum = 0;

  // sum on the host
  for (int i = 0; i < LEN; i++) {
    a[i] = rand() % 3;
    b[i] = rand() % 3;
    sum += a[i]*b[i];
  }

  // sum on the device
  int *da, *db, *dr;
  const int n = LEN;
  cudaMalloc((void**)&da, sizeof(int)*LEN);
  cudaMalloc((void**)&db, sizeof(int)*LEN);
  cudaMalloc((void**)&dr, sizeof(int)*BLOCK_SIZE);
  cudaMemcpy(da, a, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(db, b, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  // zero the partial-sum buffer: dot_kernel writes only (n+255)/256 entries,
  // but final() sums all BLOCK_SIZE of them
  cudaMemset(dr, 0, sizeof(int)*BLOCK_SIZE);
  dot_kernel<<<(n+255)/256, 256>>>(n, da, db, dr);
  final<<<1, 256>>>(dr);
  cudaMemcpy(&d_sum, dr, sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d\n", sum, d_sum);

  cudaFree(da);
  cudaFree(db);
  cudaFree(dr);
  return 0;
}
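To be clear about what I mean by omitting the barriers in my reproducer: the failing variant skips __syncthreads() for the steps that fit within a single warp, roughly like this (my modification of the reduction loop above; note that, unlike the miniFE kernels, red[] in my reproducer is not declared volatile):

  // Variant that gives me wrong sums: rely on implicit warp synchronization once
  // the active threads fit in a single warp (n < 32).
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {
    if (n >= 32) __syncthreads();   // n is uniform across the block, so all threads take the same branch
    if (threadIdx.x < n) { sum += red[threadIdx.x+n]; red[threadIdx.x] = sum; }
  }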