Open weilewei opened 7 months ago
the prefix sum standalone code is actually working
/*
* MIT License
*
* Copyright (c) 2023 The Regents of the University of California,
* through Lawrence Berkeley National Laboratory (subject to receipt of any
* required approvals from the U.S. Dept. of Energy).All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* commons for the prefixSum codes
*/
#include <math.h>
#include <algorithm>
#include <bit>
#include <cassert>
#include <execution>
#include <iostream>
#include <vector>
#include <random>
#include <stdexec/execution.hpp>
#include <nvexec/stream_context.cuh>
using namespace std;
using namespace stdexec;
namespace ex = stdexec;
using namespace nvexec;
inline bool isPowOf2(long long int x) {
return !(x == 0) && !(x & (x - 1));
}
inline int ilog2(uint32_t x)
{
return static_cast<int>(log2(x));
}
// data type
using data_t = unsigned long long;
inline int ceilPowOf2(unsigned int v)
{
return static_cast<int>(std::bit_ceil(v));
}
template <typename T>
void genRandomVector(T *in, int N, T lower, T upper)
{
// random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<T> dist(lower, upper);
// fill random between 1 to 10
std::generate(std::execution::seq, in, in+N, [&gen,&dist]() { return dist(gen); });
}
//
// stdexec prefixSum function
//
template <typename T>
[[nodiscard]] T* prefixSum(scheduler auto &&sch, const T* in, const int N)
{
// allocate a N+1 size array as there will be a trailing zero
T *y = new T[N+1];
// number of iterations
int niters = ilog2(N);
// need to be dynamic memory to be able to use it in gpu ctx.
int *d_ptr = new int(0);
// memcpy to output vector to start computation.
ex::sync_wait(ex::schedule(sch) | ex::bulk(N, [=](int k){ y[k] = in[k]; }));
// GE Blelloch (1990) algorithm from pseudocode at:
// https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
// upsweep
for (int d = 0; d < niters; d++)
{
int bsize = N/(1 << d+1);
ex::sender auto uSweep = schedule(sch)
| ex::bulk(bsize, [=](int k) {
// stride1 = 2^(d+1)
int st1 = 1 << d+1;
// stride2 = 2^d
int st2 = 1 << d;
// only the threads at indices (k+1) * 2^(d+1) -1 will compute
int myIdx = (k+1) * st1 - 1;
// update y[myIdx]
y[myIdx] += y[myIdx-st2];
}
);
// wait for upsweep
ex::sync_wait(uSweep);
}
// write sum to y[N] and reset vars
ex::sync_wait(schedule(sch) | ex::then([=](){
y[N] = y[N-1];
y[N-1] = 0;
}));
// downsweep
for (int d = niters-1; d >= 0; d--)
{
int bsize = N/(1 << d+1);
ex::sender auto dSweep = schedule(sch)
| ex::bulk(bsize, [=](int k){
// stride1 = 2^(d+1)
int st1 = 1 << d+1;
// stride2 = 2^d
int st2 = 1 << d;
// only the threads at indices (k+1) * 2^(d+1) -1 will compute
int myIdx = (k+1) * st1 - 1;
// update y[myIdx] and y[myIdx-stride2]
auto tmp = y[myIdx];
y[myIdx] += y[myIdx-st2];
y[myIdx-st2] = tmp;
});
// wait for downsweep
ex::sync_wait(dSweep);
}
// return the computed results.
return y;
}
//
// simulation
//
int main(int argc, char* argv[])
{
// simulation variables
int N = 1e9;
if (!isPowOf2(N))
{
N = ceilPowOf2(N);
std::cout << "INFO: N != pow(2). Setting => N = " << N << std::endl;
}
// input data
data_t *in = new data_t[N];
std::cout << "Progress:0%" << std::flush;
// random number generator
genRandomVector(in, N, (data_t)0, (data_t)10);
std::cout << "..50%" << std::flush;
// output pointer
data_t *out = nullptr;
// gpu scheduler
auto gpu_sch = nvexec::stream_context().get_scheduler();
out = prefixSum(gpu_sch, in, N);
std::cout << "..100%" << std::endl << std::flush;
// return status
return 0;
}