#include <cstdio>
#include <hip/hip_runtime.h>
__global__ void my_kernel(int * data_in, int * data_out)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
data_out[idx] = __shfl_down(data_in[idx], 16);
}
int main()
{
int count = 1024;
int * data_in;
int * data_out;
hipMallocManaged((void**)&data_in, count * sizeof(int));
hipMallocManaged((void**)&data_out, count * sizeof(int));
for(int i = 0; i < count; i++)
{
data_in[i] = i;
data_out[i] = -1;
}
printf("Input:");
for(int i = 0; i < count; i++)
{
if(i % 32 == 0)
printf("\n");
printf("%5d ", data_in[i]);
}
printf("\n");
int tpb = 256;
int bpg = count / tpb;
hipLaunchKernelGGL(my_kernel, bpg, tpb, 0, 0, data_in, data_out);
hipDeviceSynchronize();
printf("Output:");
for(int i = 0; i < count; i++)
{
if(i % 32 == 0)
printf("\n");
printf("%5d ", data_out[i]);
}
printf("\n");
printf("Diff:");
for(int i = 0; i < count; i++)
{
if(i % 32 == 0)
printf("\n");
printf("%5d ", data_out[i] - data_in[i]);
}
printf("\n");
hipFree(data_in);
hipFree(data_out);
return 0;
}
The only thing the program does, is it shifts the values from the input buffer and stores the shifted data to the output buffer, using warp shuffle function __shfl_down.
Compiling it using hipcc and running it on the GPU produces expected results, the data in each warp are shifted by 16 values, except for the last 16 values in the warp, which maintained their original value.
But using the HIP-CPU library, the results are incorrect. It seems that the warp shuffle is performed only in the first warp of each threadblock.
The function __shfl_up has similarly incorrect behaviour, and even weirder.
__shfl_xor seems totally wrong, it works as if just __shfl was used, forgetting about the xor.
__shfl behavior seems ok.
Note that I am not assuming any warpSize, if the warpSize was anything, the output would still be wrong.
Compilation of the HIP-CPU program was performed using the command
Consider the following HIP program:
The only thing the program does, is it shifts the values from the input buffer and stores the shifted data to the output buffer, using warp shuffle function
__shfl_down
.Compiling it using
hipcc
and running it on the GPU produces expected results, the data in each warp are shifted by 16 values, except for the last 16 values in the warp, which maintained their original value.But using the HIP-CPU library, the results are incorrect. It seems that the warp shuffle is performed only in the first warp of each threadblock. The function
__shfl_up
has similarly incorrect behaviour, and even weirder.__shfl_xor
seems totally wrong, it works as if just__shfl
was used, forgetting about the xor.__shfl
behavior seems ok.Note that I am not assuming any warpSize, if the warpSize was anything, the output would still be wrong.
Compilation of the HIP-CPU program was performed using the command
I am attaching the outputs of the programs. The GPU runs were on an AMD GPU. out_down_hipCpu.txt out_down_hipGpu.txt out_up_hipCpu.txt out_up_hipGpu.txt out_xor_hipCpu.txt out_xor_hipGpu.txt