In the following code from miniFE, the comments claim that __syncthreads() is not needed once the reduction is down to a single warp. However, I think those __syncthreads() calls are actually required to produce a correct sum: I got incorrect results when I omitted them. Could you reproduce the issue? A standalone reproducer follows the miniFE kernels below. Thank you for your comments.
template<typename Vector>
__global__ void dot_kernel(const Vector x, const Vector y, typename TypeTraits<typename Vector::ScalarType>::magnitude_type *d) {
  typedef typename TypeTraits<typename Vector::ScalarType>::magnitude_type magnitude;
  const int BLOCK_SIZE=512;
  magnitude sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<x.n;idx+=gridDim.x*blockDim.x) {
    sum+=x.coefs[idx]*y.coefs[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ volatile magnitude red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  //__syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)  {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)  {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8)  {sum+=red[threadIdx.x+8];  red[threadIdx.x]=sum;}
  if(threadIdx.x<4)  {sum+=red[threadIdx.x+4];  red[threadIdx.x]=sum;}
  if(threadIdx.x<2)  {sum+=red[threadIdx.x+2];  red[threadIdx.x]=sum;}
  if(threadIdx.x<1)  {sum+=red[threadIdx.x+1];}

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}
template<typename Scalar>
__global__ void dot_final_reduce_kernel(Scalar *d) {
  const int BLOCK_SIZE=1024;
  Scalar sum=d[threadIdx.x];
  __shared__ volatile Scalar red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  __syncthreads(); if(threadIdx.x<512) {sum+=red[threadIdx.x+512]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<256) {sum+=red[threadIdx.x+256]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<128) {sum+=red[threadIdx.x+128]; red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<64)  {sum+=red[threadIdx.x+64];  red[threadIdx.x]=sum;}
  __syncthreads(); if(threadIdx.x<32)  {sum+=red[threadIdx.x+32];  red[threadIdx.x]=sum;}
  //the remaining ones don't need syncthreads because they are warp synchronous
  if(threadIdx.x<16) {sum+=red[threadIdx.x+16]; red[threadIdx.x]=sum;}
  if(threadIdx.x<8)  {sum+=red[threadIdx.x+8];  red[threadIdx.x]=sum;}
  if(threadIdx.x<4)  {sum+=red[threadIdx.x+4];  red[threadIdx.x]=sum;}
  if(threadIdx.x<2)  {sum+=red[threadIdx.x+2];  red[threadIdx.x]=sum;}
  if(threadIdx.x<1)  {sum+=red[threadIdx.x+1];}

  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}
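For reference, here is how I imagine the warp-level tail of these kernels could be made explicitly synchronized. This is my own sketch for illustration, not the miniFE code, and I am assuming __syncwarp() (available since CUDA 9) is the appropriate primitive here. It would replace the lines from the "__syncthreads(); if(threadIdx.x<32)" step through the "if(threadIdx.x<1)" step:

  // My own sketch, not the miniFE code: keep the __syncthreads() steps above unchanged,
  // but make the intra-warp steps explicit. The __syncwarp() between each shared-memory
  // read and the following write orders the accesses within the warp.
  __syncthreads();
  if(threadIdx.x<32) {
    sum+=red[threadIdx.x+32]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+16]; __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+8];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+4];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+2];  __syncwarp();
    red[threadIdx.x]=sum;     __syncwarp();
    sum+=red[threadIdx.x+1];
  }

Is this the kind of change that is needed here, or is relying on volatile plus implicit warp-synchronous execution still considered safe?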
Here is the standalone reproducer I mentioned above; it uses a plain int dot product but the same reduction structure:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define BLOCK_SIZE 256

__global__ void dot_kernel(const int n, const int* x, const int* y, int *d) {
  int sum=0;
  for(int idx=blockIdx.x*blockDim.x+threadIdx.x;idx<n;idx+=gridDim.x*blockDim.x) {
    sum+=x[idx]*y[idx];
  }

  //Do a shared memory reduction on the dot product
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) { // incorrect results when __syncthreads() is omitted for the warp-sized steps (n < 32)
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }

  //save partial dot products
  if(threadIdx.x==0) d[blockIdx.x]=sum;
}

__global__ void final(int *d) {
  int sum=d[threadIdx.x];
  __shared__ int red[BLOCK_SIZE];
  red[threadIdx.x]=sum;
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {
    __syncthreads();
    if(threadIdx.x<n) {sum+=red[threadIdx.x+n]; red[threadIdx.x]=sum;}
  }

  //save final dot product at the front
  if(threadIdx.x==0) d[0]=sum;
}

#define LEN 1025

int main() {
  int a[LEN];
  int b[LEN];
  srand(2);
  int sum = 0;
  int d_sum = 0;

  // sum on the host
  for (int i = 0; i < LEN; i++) {
    a[i] = rand() % 3;
    b[i] = rand() % 3;
    sum += a[i]*b[i];
  }

  // sum on the device
  int *da, *db, *dr;
  const int n = LEN;
  cudaMalloc((void**)&da, sizeof(int)*LEN);
  cudaMalloc((void**)&db, sizeof(int)*LEN);
  cudaMalloc((void**)&dr, sizeof(int)*BLOCK_SIZE);
  cudaMemcpy(da, a, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  cudaMemcpy(db, b, sizeof(int)*LEN, cudaMemcpyHostToDevice);
  // zero the partial-sum buffer: dot_kernel writes only (n+255)/256 entries,
  // but final() sums all BLOCK_SIZE of them
  cudaMemset(dr, 0, sizeof(int)*BLOCK_SIZE);
  dot_kernel<<<(n+255)/256, 256>>>(n, da, db, dr);
  final<<<1, 256>>>(dr);
  cudaMemcpy(&d_sum, dr, sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d\n", sum, d_sum);

  cudaFree(da);
  cudaFree(db);
  cudaFree(dr);
  return 0;
}
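To be clear about what I mean by omitting the barriers in my reproducer: the failing variant skips __syncthreads() for the steps that fit within a single warp, roughly like this (my modification of the reduction loop above; note that, unlike the miniFE kernels, red[] in my reproducer is not declared volatile):

  // Variant that gives me wrong sums: rely on implicit warp synchronization once
  // the active threads fit in a single warp (n < 32).
  #pragma unroll
  for (int n = 128; n > 0; n = n/2) {
    if (n >= 32) __syncthreads();   // n is uniform across the block, so all threads take the same branch
    if (threadIdx.x < n) { sum += red[threadIdx.x+n]; red[threadIdx.x] = sum; }
  }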