Optimizations - Githubissues

hoekkii commented 1 month ago

Changed the code a bit so the GPU would like it just a tiny bit more: Changed all possible data types to unsigned, I thought I saw a tiny improvement on d_maxOnes, but this should be analyzed more; mix and match these. Removed branches in the bit-checking and unrolled the dice-rolling. Added an additional check to break, as it seemed the GPU liked this more, accessing d_rolls after every atomicAdd seemed to slow things down (?).

#include <curand_kernel.h>
#include <iostream>

#define ROUNDS 1000000000
#define ROLLS 231
#define BLOCKSIZE 1024

__global__ void sim_rolls(unsigned int *d_maxOnes, unsigned long long *d_rolls, int seed) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= ROUNDS) return;
    unsigned long long idxe = ROUNDS - idx * 4;

    __shared__ unsigned int sharedMax[BLOCKSIZE];
    __shared__ unsigned int newMax;
    curandState state;
    curand_init(seed, idx, 0, &state);
    while(*d_rolls < idxe){
        unsigned int ones = 0;
        for (unsigned int i = 0; i < 14; ++i){
            unsigned int x = curand(&state);
            unsigned int s = x >> 1;
            unsigned int c = x & (~s) & 0x55555555;
            c = (c & 0x33333333) + ((c >> 2) & 0x33333333);
            c = (c + (c >> 4)) & 0x0F0F0F0F;
            c = c + (c >> 8);
            c = c + (c >> 16);
            ones += c & 0x0000003F;
        }

        {
            unsigned int x = curand(&state) & 0x00003FFF;
            unsigned int s = x >> 1;
            unsigned int c = x & (~s) & 0x55555555;
            c = (c & 0x33333333) + ((c >> 2) & 0x33333333);
            c = (c + (c >> 4)) & 0x0F0F0F0F;
            c = c + (c >> 8);
            ones += c & 0x0000003F;
        }

        sharedMax[threadIdx.x] = max(sharedMax[threadIdx.x], ones);

        unsigned long long rolls = atomicAdd(d_rolls, 1);
        if (rolls >= ROUNDS - 1) {
            break;
        }
    }

    if(threadIdx.x == 0) {
        for (unsigned int x=0; x<BLOCKSIZE; ++x) {
            newMax = max(sharedMax[x], newMax);
        }
        atomicMax(d_maxOnes, newMax);
    }
}

int main() {
    int maxOnes = 0;
    unsigned int *d_maxOnes;
    unsigned long long rolls = 0;
    unsigned long long *d_rolls;
    cudaDeviceProp prop;
    int deviceId;
    cudaGetDevice(&deviceId);
    cudaGetDeviceProperties(&prop, deviceId);
    int smCount = prop.multiProcessorCount;

    cudaMalloc(&d_maxOnes, sizeof(unsigned int));
    cudaMalloc(&d_rolls, sizeof(unsigned long long));

    cudaMemcpy(d_maxOnes, &maxOnes, sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_rolls, &rolls, sizeof(unsigned long long), cudaMemcpyHostToDevice);

    int blockSize = BLOCKSIZE;
    int maxActiveBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, blockSize, 0);

    int numBlocks = smCount * maxActiveBlocks;

    float totalTime=0;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);
    sim_rolls<<<numBlocks, blockSize, 0>>>(d_maxOnes, d_rolls, time(NULL));
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&totalTime, start, stop);

    // Copy the result back to system memory, now that the CUDA program is over
    cudaMemcpy(&maxOnes, d_maxOnes, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&rolls, d_rolls, sizeof(unsigned long long), cudaMemcpyDeviceToHost);

    // Report the important numbers
    std::cout << "Highest Ones Roll: " << maxOnes << std::endl;
    std::cout << "Number of Roll Sessions: " << rolls << std::endl;
    std::cout << totalTime << "ms" << std::endl;

    // Never malloc without a free
    cudaFree(d_maxOnes);
    cudaFree(d_rolls);
    return 0;
}

hoekkii commented 1 month ago

I reduced the number of operations a bit more:

#include <curand_kernel.h>
#include <stdio.h>

#define ROUNDS 1000000000
#define ROLLS 231
#define BLOCKSIZE 1024

__device__ static inline unsigned int
count_u128(uint4 x)
{
    uint4 s;
    uint4 c;
    uint4 count;

    s.x = x.x >> 1;
    s.y = x.y >> 1;
    s.z = x.z >> 1;
    s.w = x.w >> 1;

    c.x = (x.x & (~s.x)) & 0x55555555U;
    c.y = (x.y & (~s.y)) & 0x55555555U;
    c.z = (x.z & (~s.z)) & 0x55555555U;
    c.w = (x.w & (~s.w)) & 0x55555555U;

    return __popc(c.x) + __popc(c.y) + __popc(c.z) + __popc(c.w);
}

__device__ static inline unsigned int
count_u78(uint4 x)
{
    uint4 s;
    uint4 c;
    uint4 count;

    s.x = x.x >> 1;
    s.y = x.y >> 1;
    s.z = x.z >> 1;

    c.x = (x.x & (~s.x)) & 0x55555555U;
    c.y = (x.y & (~s.y)) & 0x55555555U;
    c.z = (x.z & (~s.z)) & 0x00001555U;

    return __popc(c.x) + __popc(c.y) + __popc(c.z);
}

__global__ void
sim_rolls(unsigned int* d_maxOnes, unsigned int* d_rolls, unsigned long long seed)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= ROUNDS)
        return;

    __shared__ unsigned int shared_max[BLOCKSIZE];
    unsigned int rolls = *d_rolls;
    unsigned int idx_end = ROUNDS - idx * 4;
    unsigned int local_max = 0;

    curandStatePhilox4_32_10_t state;
    curand_init(seed, idx, 0, &state);
    while (rolls < idx_end)
    {
        unsigned int ones = 0;
        ones += count_u128(curand4(&state));
        ones += count_u128(curand4(&state));
        ones += count_u128(curand4(&state));
        ones += count_u78 (curand4(&state));
        local_max = max(local_max, ones);

        rolls = atomicAdd(d_rolls, 1);
        if (rolls >= ROUNDS - 1)
            break;
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        unsigned int newMax = 0;
        for (unsigned int x = 0; x < BLOCKSIZE; ++x)
            newMax = max(shared_max[x], newMax);

        atomicMax(d_maxOnes, newMax);
    }
}

int
main()
{
    float lowest_time = 128;
    for (int i = 0; i < 32; i++)
    {
        unsigned int* d_maxOnes;
        unsigned int* d_rolls;
        unsigned int maxOnes = 0;
        unsigned int rolls = 0;

        cudaDeviceProp prop;
        int deviceId;

        cudaGetDevice(&deviceId);
        cudaGetDeviceProperties(&prop, deviceId);

        cudaMalloc(&d_maxOnes, sizeof(unsigned int));
        cudaMalloc(&d_rolls, sizeof(unsigned int));

        cudaMemcpy(d_maxOnes, &maxOnes, sizeof(unsigned int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_rolls, &rolls, sizeof(unsigned int), cudaMemcpyHostToDevice);

        int maxActiveBlocks = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
        int numBlocks = prop.multiProcessorCount * maxActiveBlocks;

        float totalTime = 0;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(d_maxOnes, d_rolls, i + time(NULL));
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&totalTime, start, stop);

        cudaMemcpy(&maxOnes, d_maxOnes, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&rolls, d_rolls, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        printf(
            "Highest Ones Roll: %i\n"
            "Number of Roll Sessions: %i\n"
            "%f ms\n\n", maxOnes, rolls, totalTime);

        cudaFree(d_maxOnes);
        cudaFree(d_rolls);

        if (totalTime < lowest_time)
            lowest_time = totalTime;
    }

    printf("The lowest found time: %f ms\n\n", lowest_time);
    return 0;
}

hoekkii commented 4 weeks ago

Replaced curand4 with wyrand (credits to Qbit):

#include <curand_kernel.h>
#include <stdio.h>

#define ROUNDS 1000000000
#define BLOCKSIZE 1024

struct data_t
{
    unsigned int rolls;
    unsigned int max;
};

__device__ static inline unsigned int
count_u128(uint4 x)
{
    uint4 s {
        x.x >> 1,
        x.y >> 1,
        x.z >> 1,
        x.w >> 1,
    };

    uint4 c {
        (x.x & (~s.x)) & 0x55555555U,
        (x.y & (~s.y)) & 0x55555555U,
        (x.z & (~s.z)) & 0x55555555U,
        (x.w & (~s.w)) & 0x55555555U,
    };

    return __popc(c.x) + __popc(c.y) + __popc(c.z) + __popc(c.w);
}

__device__ static inline unsigned int
count_u78(uint4 x)
{
    uint3 s {
        x.x >> 1,
        x.y >> 1,
        x.z >> 1,
    };

    uint3 c {
        c.x = (x.x & (~s.x)) & 0x55555555U,
        c.y = (x.y & (~s.y)) & 0x55555555U,
        c.z = (x.z & (~s.z)) & 0x00001555U,
    };

    return __popc(c.x) + __popc(c.y) + __popc(c.z);
}

__device__ __forceinline__ uint4
gen_num(uint4* seed)
{
    uint4 s = *seed;
    s.x += 4189129037;
    s.y += 3329066731;
    s.z += 2310329797;
    s.w += 3146683069;
    *seed = s;

    uint4 tmp = uint4 {
        s.x ^ 3357668329,
        s.y ^ 3324289783,
        s.z ^ 3030601363,
        s.w ^ 3334762253
    };

    return uint4 {
        __funnelshift_l(__umulhi(s.x, tmp.x), s.x * tmp.x, 16),
        __funnelshift_l(__umulhi(s.y, tmp.y), s.y * tmp.y, 16),
        __funnelshift_l(__umulhi(s.z, tmp.z), s.z * tmp.z, 16),
        __funnelshift_l(__umulhi(s.w, tmp.w), s.w * tmp.w, 16)
    };
}

__global__ void
sim_rolls(data_t* data, unsigned long long seed)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= ROUNDS)
        return;

    __shared__ unsigned int shared_max[BLOCKSIZE];
    unsigned int rolls = 0;
    unsigned int idx_end = ROUNDS - idx * 4 - 1;
    unsigned int local_max = 0;
    unsigned int ones;

    uint4 seed4 = {
        seed + idx * 3370454011,
        seed + blockIdx.x * 3718689019,
        seed + blockDim.x * 2835501367,
        seed + threadIdx.x * 3742026731
    };

    for (int i = 0; i < 7; i++)
        seed4 = gen_num(&seed4);

    while (rolls < idx_end)
    {
        ones  = count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u78 (gen_num(&seed4));
        rolls = atomicAdd(&data->rolls, 1);
        local_max = max(local_max, ones);
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        unsigned int newMax = 0;
        for (unsigned int x = 0; x < BLOCKSIZE; ++x)
            newMax = max(shared_max[x], newMax);

        atomicMax(&data->max, newMax);
    }
}

int
main()
{
    unsigned int highest_roll_of_all_time = 0;
    float lowest_time = 128;
    for (int i = 0; i < 128; i++)
    {
        data_t* data_ptr;
        data_t data { };
        cudaDeviceProp prop;
        int deviceId;

        cudaGetDevice(&deviceId);
        cudaGetDeviceProperties(&prop, deviceId);

        cudaMalloc(&data_ptr, sizeof(data_t));
        cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);

        int maxActiveBlocks = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
        int numBlocks = prop.multiProcessorCount * maxActiveBlocks;

        float totalTime = 0;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, i + time(NULL));
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&totalTime, start, stop);

        cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
        printf(
            "Highest Ones Roll: %i\n"
            "Number of Roll Sessions: %i\n"
            "%f ms\n\n", data.max, data.rolls, totalTime);

        cudaFree(data_ptr);

        if (totalTime < lowest_time)
            lowest_time = totalTime;

        if (data.max > highest_roll_of_all_time)
            highest_roll_of_all_time = data.max;
    }

    printf(
        "The lowest found time: %f ms\n"
        "The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);

    return 0;
}

hoekkii commented 4 weeks ago

Removed unnecessary operations and do two iterations per loop to reduce the atomicAdd

#include <curand_kernel.h>
#include <stdio.h>

#define ROUNDS 1000000000
#define BLOCKSIZE 1024

struct data_t
{
    unsigned int rolls;
    unsigned int max;
};

__device__ __forceinline__ unsigned int
count_u128(uint4 x)
{
    return __popc((x.x & (~(x.x >> 1))) & 0x55555555U)
        +  __popc((x.y & (~(x.y >> 1))) & 0x55555555U)
        +  __popc((x.z & (~(x.z >> 1))) & 0x55555555U)
        +  __popc((x.w & (~(x.w >> 1))) & 0x55555555U);
}

__device__ __forceinline__ unsigned int
count_u78(uint3 x)
{
    return __popc((x.x & (~(x.x >> 1))) & 0x55555555U)
        +  __popc((x.y & (~(x.y >> 1))) & 0x55555555U)
        +  __popc((x.z & (~(x.z >> 1))) & 0x00001555U);
}

__device__ __forceinline__ uint4
gen_num(uint4* seed)
{
    uint4 s = *seed;
    s.x += 4189129037;
    s.y += 3329066731;
    s.z += 2310329797;
    s.w += 3146683069;
    *seed = s;

    uint4 tmp = uint4 {
        s.x ^ 3357668329,
        s.y ^ 3324289783,
        s.z ^ 3030601363,
        s.w ^ 3334762253
    };

    return uint4 {
        (__umulhi(s.x, tmp.x) ^ (s.x * tmp.x)),
        (__umulhi(s.y, tmp.y) ^ (s.y * tmp.y)),
        (__umulhi(s.z, tmp.z) ^ (s.z * tmp.z)),
        (__umulhi(s.w, tmp.w) ^ (s.w * tmp.w))
    };
}

__device__ __forceinline__ uint3
gen_num3(uint4* seed)
{
    uint4 s = *seed;
    s.x += 4189129037;
    s.y += 3329066731;
    s.z += 2310329797;
    *seed = s;

    uint3 tmp = uint3{
        s.x ^ 3357668329,
        s.y ^ 3324289783,
        s.z ^ 3030601363,
    };

    return uint3{
        (__umulhi(s.x, tmp.x) ^ (s.x * tmp.x)),
        (__umulhi(s.y, tmp.y) ^ (s.y * tmp.y)),
        (__umulhi(s.z, tmp.z) ^ (s.z * tmp.z))
    };
}

__global__ void
sim_rolls(data_t* data, unsigned long long seed)
{
    __shared__ unsigned int shared_max[BLOCKSIZE];
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idx_end = ROUNDS - idx * 8 - 2;
    unsigned int rolls = 0;
    unsigned int local_max = 0;
    unsigned int ones;

    uint4 seed4 = {
        seed + idx * 3370454011,
        seed + blockIdx.x * 3718689019,
        seed + blockDim.x * 2835501367,
        seed + threadIdx.x * 3742026731
    };

    while (rolls < idx_end)
    {
        ones  = count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u78(gen_num3(&seed4));
        local_max = max(local_max, ones);

        ones = count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u128(gen_num(&seed4));
        ones += count_u78(gen_num3(&seed4));
        local_max = max(local_max, ones);

        rolls = atomicAdd(&data->rolls, 2);
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        unsigned int newMax = 0;
        for (unsigned int x = 0; x < BLOCKSIZE; ++x)
            newMax = max(shared_max[x], newMax);

        atomicMax(&data->max, newMax);
    }
}

int
main()
{
    unsigned int highest_roll_of_all_time = 0;
    float lowest_time = 128;
    data_t* data_ptr;
    cudaDeviceProp prop;
    int deviceId;
    cudaGetDevice(&deviceId);
    cudaGetDeviceProperties(&prop, deviceId);
    cudaMalloc(&data_ptr, sizeof(data_t));

    int maxActiveBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
    int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
    printf("numBlocks: %i\n", numBlocks);

    for (int i = 0; i < 32; i++)
    {
        data_t data { };
        cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);

        float totalTime;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, i + time(NULL));
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&totalTime, start, stop);

        cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
        printf(
            "Highest Ones Roll: %i\n"
            "Number of Roll Sessions: %i\n"
            "%f ms\n\n", data.max, data.rolls, totalTime);

        if (totalTime < lowest_time)
            lowest_time = totalTime;

        if (data.max > highest_roll_of_all_time)
            highest_roll_of_all_time = data.max;
    }

    printf(
        "The lowest found time: %f ms\n"
        "The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);

    cudaFree(data_ptr);
    return 0;
}

hoekkii commented 4 weeks ago

Apparently SIMD is not a thing on uint4 types in CUDA. Therefore the previous attempts caused less problems in the dependency chain. Therefore more focus should go there. I made some changes and this is the 8.5ms solution:

#include <curand_kernel.h>
#include <stdio.h>

#define ROUNDS 1000000000
#define BLOCKSIZE 1024

struct data_t
{
    unsigned int rolls;
    unsigned int max;
};

__device__ __forceinline__ unsigned int
gencalc_14(const unsigned int s)
{
    unsigned int x = (__umulhi(s, s ^ 3357668329) ^ ((s ^ 3164281891) * s));
    return __popc((x & (~(x >> 1))) & 0x00001555U);
}

__device__ __forceinline__ unsigned int
gencalc_32(const unsigned int s)
{
    unsigned int x = (__umulhi(s, s ^ 3357668329) ^ ((s ^ 3164281891) * s));
    return __popc((x & (~(x >> 1))) & 0x55555555U);
}

__device__ __forceinline__ unsigned int
genones(unsigned int seed0)
{
    unsigned int ones;
    ones  = gencalc_32(seed0 + 2218115371);
    ones += gencalc_32(seed0 + 2512054063);
    ones += gencalc_32(seed0 + 3726238001);
    ones += gencalc_32(seed0 + 2416398973);

    ones += gencalc_32(seed0 + 2191615213);
    ones += gencalc_32(seed0 + 3679519709);
    ones += gencalc_32(seed0 + 2951526817);
    ones += gencalc_32(seed0 + 3741703157);

    ones += gencalc_32(seed0 + 3513381479);
    ones += gencalc_32(seed0 + 3456220219);
    ones += gencalc_32(seed0 + 2404646063);
    ones += gencalc_32(seed0 + 2335906691);

    ones += gencalc_32(seed0 + 3698060357);
    ones += gencalc_32(seed0 + 3115986383);
    ones += gencalc_14(seed0 + 2512054063);
    return ones;
}

__global__ void
sim_rolls(data_t* data, unsigned int seed)
{
    __shared__ unsigned int shared_max[BLOCKSIZE];
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idx_end = ROUNDS - idx * 8 - 2;
    unsigned int seed0 = seed + idx * 3370454011;
    unsigned int seed1 = seed + blockIdx.x * 3718689019;
    unsigned int rolls = 0;
    unsigned int local_max = 0;

    while (rolls < idx_end)
    {
        local_max = max(local_max, genones(seed0 += 3638385337));
        local_max = max(local_max, genones(seed1 += 3096000953));
        rolls = atomicAdd(&data->rolls, 2);
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    if (threadIdx.x == 0)
    {
        unsigned int newMax = 0;
        for (unsigned int x = 0; x < BLOCKSIZE; ++x)
            newMax = max(shared_max[x], newMax);

        atomicMax(&data->max, newMax);
    }
}

int
main()
{
    unsigned int highest_roll_of_all_time = 0;
    float lowest_time = 128;

    for (int i = 0; i < 32; i++)
    {
        data_t* data_ptr;
        cudaDeviceProp prop;
        int deviceId;
        cudaGetDevice(&deviceId);
        cudaGetDeviceProperties(&prop, deviceId);
        cudaMalloc(&data_ptr, sizeof(data_t));

        int maxActiveBlocks = 0;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, sim_rolls, BLOCKSIZE, 0);
        int numBlocks = prop.multiProcessorCount * maxActiveBlocks;
        data_t data { };
        cudaMemcpy(data_ptr, &data, sizeof(data_t), cudaMemcpyHostToDevice);

        float totalTime;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        unsigned long long seed = i * 3096000953ULL + time(NULL);
        sim_rolls<<<numBlocks, BLOCKSIZE, 0>>>(data_ptr, *(unsigned int*)&seed);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&totalTime, start, stop);

        cudaMemcpy(&data, data_ptr, sizeof(data), cudaMemcpyDeviceToHost);
        printf(
            "Highest Ones Roll: %i\n"
            "Number of Roll Sessions: %i\n"
            "%f ms\n\n", data.max, data.rolls, totalTime);

        if (totalTime < lowest_time)
            lowest_time = totalTime;

        if (data.max > highest_roll_of_all_time)
            highest_roll_of_all_time = data.max;

        cudaFree(data_ptr);
    }

    printf(
        "The lowest found time: %f ms\n"
        "The highest rolls found: %i\n\n", lowest_time, highest_roll_of_all_time);

    return 0;
}

LokiFawkes / graveler

Optimizations #1