Open hoekkii opened 1 month ago
I reduced the number of operations a bit more:
#include <curand_kernel.h>
#include <stdio.h>
#define ROUNDS 1000000000
#define ROLLS 231
#define BLOCKSIZE 1024
__device__ static inline unsigned int
count_u128(uint4 x)
{
uint4 s;
uint4 c;
uint4 count;
s.x = x.x >> 1;
s.y = x.y >> 1;
s.z = x.z >> 1;
s.w = x.w >> 1;
c.x = (x.x & (~s.x)) & 0x55555555U;
c.y = (x.y & (~s.y)) & 0x55555555U;
c.z = (x.z & (~s.z)) & 0x55555555U;
c.w = (x.w & (~s.w)) & 0x55555555U;
return __popc(c.x) + __popc(c.y) + __popc(c.z) + __popc(c.w);
}
__device__ static inline unsigned int
count_u78(uint4 x)
{
uint4 s;
uint4 c;
uint4 count;
s.x = x.x >> 1;
s.y = x.y >> 1;
s.z = x.z >> 1;
c.x = (x.x & (~s.x)) & 0x55555555U;
c.y = (x.y & (~s.y)) & 0x55555555U;
c.z = (x.z & (~s.z)) & 0x00001555U;
return __popc(c.x) + __popc(c.y) + __popc(c.z);
}
__global__ void
sim_rolls(unsigned int* d_maxOnes, unsigned int* d_rolls, unsigned long long seed)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= ROUNDS)
return;
__shared__ unsigned int shared_max[BLOCKSIZE];
unsigned int rolls = *d_rolls;
unsigned int idx_end = ROUNDS - idx * 4;
unsigned int local_max = 0;
curandStatePhilox4_32_10_t state;
curand_init(seed, idx, 0, &state);
while (rolls < idx_end)
{
unsigned int ones = 0;
ones += count_u128(curand4(&state));
ones += count_u128(curand4(&state));
ones += count_u128(curand4(&state));
ones += count_u78 (curand4(&state));
local_max = max(local_max, ones);
rolls = atomicAdd(d_rolls, 1);
if (rolls >= ROUNDS - 1)
break;
}
shared_max[threadIdx.x] = local_max;
__syncthreads();
if (threadIdx.x == 0)
{
unsigned int newMax = 0;
for (unsigned int x = 0; x < BLOCKSIZE; ++x)
newMax = max(shared_max[x], newMax);
atomicMax(d_maxOnes, newMax);
}
}
int
main()
{
float lowest_time = 128;
for (int i = 0; i < 32; i++)
{
unsigned int* d_maxOnes;
unsigned int* d_rolls;
unsigned int maxOnes = 0;
unsigned int rolls = 0;
cudaDeviceProp prop;
int deviceId;
cudaGetDevice(&deviceId);
cudaGetDeviceProperties(&prop, deviceId);
cudaMalloc(&d_maxOnes, sizeof(unsigned int));
cudaMalloc(&d_rolls, sizeof(unsigned int));
cudaMemcpy(d_maxOnes, &maxOnes, sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaMemcpy(d_rolls, &rolls, sizeof(unsigned int), cudaMemcpyHostToDevice);
int maxActiveBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
float totalTime = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(d_maxOnes, d_rolls, i + time(NULL));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&totalTime, start, stop);
cudaMemcpy(&maxOnes, d_maxOnes, sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaMemcpy(&rolls, d_rolls, sizeof(unsigned int), cudaMemcpyDeviceToHost);
printf(
"Highest Ones Roll: %i\n"
"Number of Roll Sessions: %i\n"
"%f ms\n\n", maxOnes, rolls, totalTime);
cudaFree(d_maxOnes);
cudaFree(d_rolls);
if (totalTime < lowest_time)
lowest_time = totalTime;
}
printf("The lowest found time: %f ms\n\n", lowest_time);
return 0;
}
Replaced curand4 with wyrand (credits to Qbit):
#include <curand_kernel.h>
#include <stdio.h>
#define ROUNDS 1000000000
#define BLOCKSIZE 1024
struct data_t
{
unsigned int rolls;
unsigned int max;
};
__device__ static inline unsigned int
count_u128(uint4 x)
{
uint4 s {
x.x >> 1,
x.y >> 1,
x.z >> 1,
x.w >> 1,
};
uint4 c {
(x.x & (~s.x)) & 0x55555555U,
(x.y & (~s.y)) & 0x55555555U,
(x.z & (~s.z)) & 0x55555555U,
(x.w & (~s.w)) & 0x55555555U,
};
return __popc(c.x) + __popc(c.y) + __popc(c.z) + __popc(c.w);
}
__device__ static inline unsigned int
count_u78(uint4 x)
{
uint3 s {
x.x >> 1,
x.y >> 1,
x.z >> 1,
};
uint3 c {
c.x = (x.x & (~s.x)) & 0x55555555U,
c.y = (x.y & (~s.y)) & 0x55555555U,
c.z = (x.z & (~s.z)) & 0x00001555U,
};
return __popc(c.x) + __popc(c.y) + __popc(c.z);
}
__device__ __forceinline__ uint4
gen_num(uint4* seed)
{
uint4 s = *seed;
s.x += 4189129037;
s.y += 3329066731;
s.z += 2310329797;
s.w += 3146683069;
*seed = s;
uint4 tmp = uint4 {
s.x ^ 3357668329,
s.y ^ 3324289783,
s.z ^ 3030601363,
s.w ^ 3334762253
};
return uint4 {
__funnelshift_l(__umulhi(s.x, tmp.x), s.x * tmp.x, 16),
__funnelshift_l(__umulhi(s.y, tmp.y), s.y * tmp.y, 16),
__funnelshift_l(__umulhi(s.z, tmp.z), s.z * tmp.z, 16),
__funnelshift_l(__umulhi(s.w, tmp.w), s.w * tmp.w, 16)
};
}
__global__ void
sim_rolls(data_t* data, unsigned long long seed)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= ROUNDS)
return;
__shared__ unsigned int shared_max[BLOCKSIZE];
unsigned int rolls = 0;
unsigned int idx_end = ROUNDS - idx * 4 - 1;
unsigned int local_max = 0;
unsigned int ones;
uint4 seed4 = {
seed + idx * 3370454011,
seed + blockIdx.x * 3718689019,
seed + blockDim.x * 2835501367,
seed + threadIdx.x * 3742026731
};
for (int i = 0; i < 7; i++)
seed4 = gen_num(&seed4);
while (rolls < idx_end)
{
ones = count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u78 (gen_num(&seed4));
rolls = atomicAdd(&data->rolls, 1);
local_max = max(local_max, ones);
}
shared_max[threadIdx.x] = local_max;
__syncthreads();
if (threadIdx.x == 0)
{
unsigned int newMax = 0;
for (unsigned int x = 0; x < BLOCKSIZE; ++x)
newMax = max(shared_max[x], newMax);
atomicMax(&data->max, newMax);
}
}
int
main()
{
unsigned int highest_roll_of_all_time = 0;
float lowest_time = 128;
for (int i = 0; i < 128; i++)
{
data_t* data_ptr;
data_t data { };
cudaDeviceProp prop;
int deviceId;
cudaGetDevice(&deviceId);
cudaGetDeviceProperties(&prop, deviceId);
cudaMalloc(&data_ptr, sizeof(data_t));
cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);
int maxActiveBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
float totalTime = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, i + time(NULL));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&totalTime, start, stop);
cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
printf(
"Highest Ones Roll: %i\n"
"Number of Roll Sessions: %i\n"
"%f ms\n\n", data.max, data.rolls, totalTime);
cudaFree(data_ptr);
if (totalTime < lowest_time)
lowest_time = totalTime;
if (data.max > highest_roll_of_all_time)
highest_roll_of_all_time = data.max;
}
printf(
"The lowest found time: %f ms\n"
"The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);
return 0;
}
Removed unnecessary operations and do two iterations per loop to reduce the atomicAdd
#include <curand_kernel.h>
#include <stdio.h>
#define ROUNDS 1000000000
#define BLOCKSIZE 1024
struct data_t
{
unsigned int rolls;
unsigned int max;
};
__device__ __forceinline__ unsigned int
count_u128(uint4 x)
{
return __popc((x.x & (~(x.x >> 1))) & 0x55555555U)
+ __popc((x.y & (~(x.y >> 1))) & 0x55555555U)
+ __popc((x.z & (~(x.z >> 1))) & 0x55555555U)
+ __popc((x.w & (~(x.w >> 1))) & 0x55555555U);
}
__device__ __forceinline__ unsigned int
count_u78(uint3 x)
{
return __popc((x.x & (~(x.x >> 1))) & 0x55555555U)
+ __popc((x.y & (~(x.y >> 1))) & 0x55555555U)
+ __popc((x.z & (~(x.z >> 1))) & 0x00001555U);
}
__device__ __forceinline__ uint4
gen_num(uint4* seed)
{
uint4 s = *seed;
s.x += 4189129037;
s.y += 3329066731;
s.z += 2310329797;
s.w += 3146683069;
*seed = s;
uint4 tmp = uint4 {
s.x ^ 3357668329,
s.y ^ 3324289783,
s.z ^ 3030601363,
s.w ^ 3334762253
};
return uint4 {
(__umulhi(s.x, tmp.x) ^ (s.x * tmp.x)),
(__umulhi(s.y, tmp.y) ^ (s.y * tmp.y)),
(__umulhi(s.z, tmp.z) ^ (s.z * tmp.z)),
(__umulhi(s.w, tmp.w) ^ (s.w * tmp.w))
};
}
__device__ __forceinline__ uint3
gen_num3(uint4* seed)
{
uint4 s = *seed;
s.x += 4189129037;
s.y += 3329066731;
s.z += 2310329797;
*seed = s;
uint3 tmp = uint3{
s.x ^ 3357668329,
s.y ^ 3324289783,
s.z ^ 3030601363,
};
return uint3{
(__umulhi(s.x, tmp.x) ^ (s.x * tmp.x)),
(__umulhi(s.y, tmp.y) ^ (s.y * tmp.y)),
(__umulhi(s.z, tmp.z) ^ (s.z * tmp.z))
};
}
__global__ void
sim_rolls(data_t* data, unsigned long long seed)
{
__shared__ unsigned int shared_max[BLOCKSIZE];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idx_end = ROUNDS - idx * 8 - 2;
unsigned int rolls = 0;
unsigned int local_max = 0;
unsigned int ones;
uint4 seed4 = {
seed + idx * 3370454011,
seed + blockIdx.x * 3718689019,
seed + blockDim.x * 2835501367,
seed + threadIdx.x * 3742026731
};
while (rolls < idx_end)
{
ones = count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u78(gen_num3(&seed4));
local_max = max(local_max, ones);
ones = count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u128(gen_num(&seed4));
ones += count_u78(gen_num3(&seed4));
local_max = max(local_max, ones);
rolls = atomicAdd(&data->rolls, 2);
}
shared_max[threadIdx.x] = local_max;
__syncthreads();
if (threadIdx.x == 0)
{
unsigned int newMax = 0;
for (unsigned int x = 0; x < BLOCKSIZE; ++x)
newMax = max(shared_max[x], newMax);
atomicMax(&data->max, newMax);
}
}
int
main()
{
unsigned int highest_roll_of_all_time = 0;
float lowest_time = 128;
data_t* data_ptr;
cudaDeviceProp prop;
int deviceId;
cudaGetDevice(&deviceId);
cudaGetDeviceProperties(&prop, deviceId);
cudaMalloc(&data_ptr, sizeof(data_t));
int maxActiveBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
printf("numBlocks: %i\n", numBlocks);
for (int i = 0; i < 32; i++)
{
data_t data { };
cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);
float totalTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, i + time(NULL));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&totalTime, start, stop);
cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
printf(
"Highest Ones Roll: %i\n"
"Number of Roll Sessions: %i\n"
"%f ms\n\n", data.max, data.rolls, totalTime);
if (totalTime < lowest_time)
lowest_time = totalTime;
if (data.max > highest_roll_of_all_time)
highest_roll_of_all_time = data.max;
}
printf(
"The lowest found time: %f ms\n"
"The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);
cudaFree(data_ptr);
return 0;
}
Apparently SIMD is not a thing on uint4 types in CUDA. Therefore the previous attempts caused less problems in the dependency chain. Therefore more focus should go there. I made some changes and this is the 8.5ms solution:
#include <curand_kernel.h>
#include <stdio.h>
#define ROUNDS 1000000000
#define BLOCKSIZE 1024
struct data_t
{
unsigned int rolls;
unsigned int max;
};
__device__ __forceinline__ unsigned int
gencalc_14(const unsigned int s)
{
unsigned int x = (__umulhi(s, s ^ 3357668329) ^ ((s ^ 3164281891) * s));
return __popc((x & (~(x >> 1))) & 0x00001555U);
}
__device__ __forceinline__ unsigned int
gencalc_32(const unsigned int s)
{
unsigned int x = (__umulhi(s, s ^ 3357668329) ^ ((s ^ 3164281891) * s));
return __popc((x & (~(x >> 1))) & 0x55555555U);
}
__device__ __forceinline__ unsigned int
genones(unsigned int seed0)
{
unsigned int ones;
ones = gencalc_32(seed0 + 2218115371);
ones += gencalc_32(seed0 + 2512054063);
ones += gencalc_32(seed0 + 3726238001);
ones += gencalc_32(seed0 + 2416398973);
ones += gencalc_32(seed0 + 2191615213);
ones += gencalc_32(seed0 + 3679519709);
ones += gencalc_32(seed0 + 2951526817);
ones += gencalc_32(seed0 + 3741703157);
ones += gencalc_32(seed0 + 3513381479);
ones += gencalc_32(seed0 + 3456220219);
ones += gencalc_32(seed0 + 2404646063);
ones += gencalc_32(seed0 + 2335906691);
ones += gencalc_32(seed0 + 3698060357);
ones += gencalc_32(seed0 + 3115986383);
ones += gencalc_14(seed0 + 2512054063);
return ones;
}
__global__ void
sim_rolls(data_t* data, unsigned int seed)
{
__shared__ unsigned int shared_max[BLOCKSIZE];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idx_end = ROUNDS - idx * 8 - 2;
unsigned int seed0 = seed + idx * 3370454011;
unsigned int seed1 = seed + blockIdx.x * 3718689019;
unsigned int rolls = 0;
unsigned int local_max = 0;
while (rolls < idx_end)
{
local_max = max(local_max, genones(seed0 += 3638385337));
local_max = max(local_max, genones(seed1 += 3096000953));
rolls = atomicAdd(&data->rolls, 2);
}
shared_max[threadIdx.x] = local_max;
__syncthreads();
if (threadIdx.x == 0)
{
unsigned int newMax = 0;
for (unsigned int x = 0; x < BLOCKSIZE; ++x)
newMax = max(shared_max[x], newMax);
atomicMax(&data->max, newMax);
}
}
int
main()
{
unsigned int highest_roll_of_all_time = 0;
float lowest_time = 128;
for (int i = 0; i < 32; i++)
{
data_t* data_ptr;
cudaDeviceProp prop;
int deviceId;
cudaGetDevice(&deviceId);
cudaGetDeviceProperties(&prop, deviceId);
cudaMalloc(&data_ptr, sizeof(data_t));
int maxActiveBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
data_t data { };
cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);
float totalTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
unsigned long long seed = i * 3096000953ULL + time(NULL);
sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, *(unsigned int*)&seed);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&totalTime, start, stop);
cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
printf(
"Highest Ones Roll: %i\n"
"Number of Roll Sessions: %i\n"
"%f ms\n\n", data.max, data.rolls, totalTime);
if (totalTime < lowest_time)
lowest_time = totalTime;
if (data.max > highest_roll_of_all_time)
highest_roll_of_all_time = data.max;
cudaFree(data_ptr);
}
printf(
"The lowest found time: %f ms\n"
"The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);
return 0;
}
Changed the code a bit so the GPU would like it just a tiny bit more: Changed all possible data types to unsigned, I thought I saw a tiny improvement on d_maxOnes, but this should be analyzed more; mix and match these. Removed branches in the bit-checking and unrolled the dice-rolling. Added an additional check to break, as it seemed the GPU liked this more, accessing d_rolls after every atomicAdd seemed to slow things down (?).