ComputationalRadiationPhysics / student_project_python_bindings

The student project investigates the performance and memory handling of Python bindings for CUDA C++ code created with pybind11.
GNU General Public License v3.0
1 stars 0 forks source link

Develop memory reference type, which can handle CPU and GPU memory #26

Closed SimeonEhrig closed 2 years ago

SimeonEhrig commented 2 years ago

For GPU memory, we can use Cupy_Ref and for CPU memory we can use pybind11::array_t to pass memory between the Python and C++ side. alpaka allows us to write generic code and execute it on different accelerators, only by changing a template parameter.

Your task is to develop a generic Mem_Ref class, which exchanges CPU or GPU memory (selected by a template parameter) between the Python and C++ side.

Here is some pseudo code for the task:

tags.hpp

// tags.hpp — accelerator tag types. They are only used as template
// parameters to select a backend (see mem_ref.hpp / algo.hpp), so
// forward declarations are sufficient.
struct CPU;      // tag: host (CPU) memory backend
struct CUDAGPU;  // tag: CUDA GPU memory backend

mem_ref.hpp

#include "tags.hpp"

// Primary template of the generic memory-reference class; only the
// per-accelerator specializations below are ever instantiated.
// Fixed: the keyword introducing a template is `template`, not `typename`.
template<typename TAcc>
class Mem_Ref;

typename<>
class Mem_Ref<CPU>{
  using type = pybind11::array_t<float, pybind11::array::c_style>;

  // ...
};

typename<>
class Mem_Ref<GPUCUDA>{
  using type = Cupy_Ref;

  // ...
};

algo.hpp

#include "tags.hpp"
#include "mem_ref.hpp"

template<typename TDevice>
class Algo {
public:
  void whoami();

  Ref<TDevice> get_input_memory();
  void compute(Ref<TDevice> input, Ref<TDevice> output, int size);
  Ref<TDevice> get_ouput_memory();
};

template<>
class Algo<CPU> {
public:
  using TAcc = CPU;

  float * input;
  float * output;

  void whoami(){
    std::cout << "I'm the CPU version\n";
  }

  Ref<TAcc> get_input_memory(){
    return ; // input as numpy_array or similar
  }

  void compute(Ref<TAcc> input, Ref<TAcc> output, int size){
    for(int i = 0; i < size; ++i){
      output[i] = 2 * input[i]
    }
  }

  Ref<TAcc> get_output_memory(){
    return; // output as numpy_array or similar
  }
};

template<>
class Algo<CUDAGPU> {
using TAcc = CUDAGPU;
public:
  void whoami(){
    std::cout << "I'm the CUDA GPU version\n";
  }

  Ref<TAcc> get_input_memory(){
    return ; // input as cupy_ref
  }

  void compute(Ref<TAcc> input, Ref<TAcc> output, int size){
    // execute cuda kernel
  }

  Ref<TAcc> get_output_memory(){
    return; // output as cupy_ref
  }
};

binding.cu

#include "tags.hpp"
#include "mem_ref.hpp"
#include "algo.hpp"

// ...
pybind11::class_<Algo<CPU>>(m, "AlgoCPU" , ...)
pybind11::class_<Algo<CUDAGPU>>(m, "AlgoCUDA" , ...)  // was GPUCUDA — tag is declared CUDAGPU in tags.hpp
// ...  

main.py

# The backend can be swapped just by choosing the other binding class:
# algo = binding.AlgoCPU()
algo = binding.AlgoCUDA()  # fixed: instantiate the class (was missing parentheses)

size = 10
input = algo.get_input_memory()
for i in range(size):
    input[i] = data[i]  # NOTE(review): `data` must be defined before this loop

# fixed: compute() is declared with a size parameter, so pass it
algo.compute(algo.get_input_memory(), algo.get_output_memory(), size)

print(algo.get_output_memory())