NERSC / hpcpp

MIT License
7 stars 3 forks source link

reproduce prefix sum with stdexec #15

Open weilewei opened 7 months ago

weilewei commented 7 months ago
cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR)
project(hpcpp LANGUAGES CXX CUDA)

set(CXX_STANDARD_REQUIRED ON)

# required minimum CXX standard
set(CMAKE_CXX_STANDARD_REQUIRED 23)

set(CPM_DOWNLOAD_VERSION 0.35.6)

if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
  cmake_policy(SET CMP0135 NEW)
endif()

if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION
      "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION
      "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION
      "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(
    DOWNLOAD
    https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
    ${CPM_DOWNLOAD_LOCATION})
endif()

include(${CPM_DOWNLOAD_LOCATION})

cpmaddpackage(
  NAME
  stdexec
  GITHUB_REPOSITORY
  NVIDIA/stdexec
  GIT_TAG
  main# This will always pull the latest code from the `main` branch. You may
       # also use a specific release version or tag
  OPTIONS
  "STDEXEC_ENABLE_CUDA ON"
  "STDEXEC_BUILD_EXAMPLES OFF"
  "STDEXEC_BUILD_TESTS OFF"
  "STDEXEC_ENABLE_IO_URING_TESTS OFF"
  "BUILD_TESTING OFF")

set(CMAKE_CXX_FLAGS
  "${CMAKE_CXX_FLAGS} -stdpar=gpu -mp=gpu")

add_executable(main main.cpp)
target_link_libraries(main stdexec)
weilewei commented 7 months ago

the prefix sum standalone code is actually working

/*
 * MIT License
 *
 * Copyright (c) 2023 The Regents of the University of California,
 * through Lawrence Berkeley National Laboratory (subject to receipt of any
 * required approvals from the U.S. Dept. of Energy).All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * commons for the prefixSum codes
 */

#include <math.h>
#include <algorithm>
#include <bit>
#include <cassert>
#include <execution>
#include <iostream>
#include <vector>
#include <random>

#include <stdexec/execution.hpp>
#include <nvexec/stream_context.cuh>

using namespace std;
using namespace stdexec;
namespace ex = stdexec;
using namespace nvexec;

inline bool isPowOf2(long long int x) {
  return !(x == 0) && !(x & (x - 1));
}

inline int ilog2(uint32_t x)
{
    return static_cast<int>(log2(x));
}

// data type
using data_t = unsigned long long;

inline int ceilPowOf2(unsigned int v)
{
  return static_cast<int>(std::bit_ceil(v));
}

template <typename T>
void genRandomVector(T *in, int N, T lower, T upper)
{
    // random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<T> dist(lower, upper);

    // fill random between 1 to 10
    std::generate(std::execution::seq, in, in+N, [&gen,&dist]() { return dist(gen); });
}

//
// stdexec prefixSum function
//
template <typename T>
[[nodiscard]] T* prefixSum(scheduler auto &&sch, const T* in, const int N)
{
    // allocate a N+1 size array as there will be a trailing zero
    T *y = new T[N+1];

    // number of iterations
    int niters = ilog2(N);

    // need to be dynamic memory to be able to use it in gpu ctx.
    int *d_ptr = new int(0);

    // memcpy to output vector to start computation.
    ex::sync_wait(ex::schedule(sch) | ex::bulk(N, [=](int k){ y[k] = in[k]; }));

    // GE Blelloch (1990) algorithm from pseudocode at:
    // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda

    // upsweep
    for (int d = 0; d < niters; d++)
    {
        int bsize = N/(1 << d+1);

        ex::sender auto uSweep = schedule(sch)
            | ex::bulk(bsize, [=](int k) {
                // stride1 = 2^(d+1)
                int st1 = 1 << d+1;
                // stride2 = 2^d
                int st2 = 1 << d;
                // only the threads at indices (k+1) * 2^(d+1) -1 will compute
                int myIdx = (k+1) * st1 - 1;

                // update y[myIdx]
                y[myIdx] += y[myIdx-st2];
            }
        );
        // wait for upsweep
        ex::sync_wait(uSweep);
    }

    // write sum to y[N] and reset vars
    ex::sync_wait(schedule(sch) | ex::then([=](){
        y[N] = y[N-1];
        y[N-1] = 0;
    }));

    // downsweep
    for (int d = niters-1; d >= 0; d--)
    {
        int bsize = N/(1 << d+1);

        ex::sender auto dSweep = schedule(sch)
            | ex::bulk(bsize, [=](int k){
            // stride1 = 2^(d+1)
            int st1 = 1 << d+1;
            // stride2 = 2^d
            int st2 = 1 << d;
            // only the threads at indices (k+1) * 2^(d+1) -1 will compute
            int myIdx = (k+1) * st1 - 1;

            // update y[myIdx] and y[myIdx-stride2]
            auto tmp = y[myIdx];
            y[myIdx] += y[myIdx-st2];
            y[myIdx-st2] = tmp;
        });

        // wait for downsweep
        ex::sync_wait(dSweep);
    }

    // return the computed results.
    return y;
}

//
// simulation
//
int main(int argc, char* argv[])
{
    // simulation variables
    int N = 1e9;

    if (!isPowOf2(N))
    {
        N = ceilPowOf2(N);
        std::cout << "INFO: N != pow(2). Setting => N = " << N << std::endl;
    }

    // input data
    data_t *in = new data_t[N];

    std::cout << "Progress:0%" << std::flush;

    // random number generator
    genRandomVector(in, N, (data_t)0, (data_t)10);

    std::cout << "..50%" << std::flush;

    // output pointer
    data_t *out = nullptr;

    // gpu scheduler
    auto gpu_sch = nvexec::stream_context().get_scheduler();

    out = prefixSum(gpu_sch, in, N);

    std::cout << "..100%" << std::endl << std::flush;

    // return status
    return 0;
}